diff --git a/lib/gc/Transforms/GPU/AllocsToSLM.cpp b/lib/gc/Transforms/GPU/AllocsToSLM.cpp
index d3abf26c..46ec2a4a 100644
--- a/lib/gc/Transforms/GPU/AllocsToSLM.cpp
+++ b/lib/gc/Transforms/GPU/AllocsToSLM.cpp
@@ -8,6 +8,7 @@
 
 #include "gc/Transforms/Passes.h"
 
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/TransformOps/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -80,22 +81,16 @@ struct ConvertAlloc : public OpRewritePattern<memref::AllocOp> {
       return rewriter.notifyMatchFailure(
           allocOp, "Only support constant block sizes for now");
 
-    int64_t xI = xSz.value();
-    int64_t yI = ySz.value();
-    int64_t zI = zSz.value();
-
-    if (zI != 1)
-      return rewriter.notifyMatchFailure(
-          allocOp, "Only support 2D shared memory for now");
-
+    int64_t blockSizes[3] = {xSz.value(), ySz.value(), zSz.value()};
     MemRefType originalMemRefType = cast<MemRefType>(memref.getType());
     auto originalShape = originalMemRefType.getShape();
 
-    // Scale the allocation size by the number of threads in the work-group
-    int64_t newX = originalShape[0] * xI;
-    int64_t newY = originalShape[1] * yI;
-
-    SmallVector<int64_t> newShape = {newX, newY};
+    // Scale the allocation size (X dimension) by the number of threads in the
+    // work-group
+    int64_t newX =
+        originalShape[0] * blockSizes[0] * blockSizes[1] * blockSizes[2];
+    SmallVector<int64_t> newShape({newX});
+    newShape.append(originalShape.begin() + 1, originalShape.end());
 
     IntegerAttr sharedAddressSpace =
         IntegerAttr::get(rewriter.getIntegerType(64),
@@ -111,27 +106,29 @@ struct ConvertAlloc : public OpRewritePattern<memref::AllocOp> {
                              allocOp.getOperands())
                         .getResult();
 
-    // Compute the offsets in SLM chunk for the current thread
-    auto origXConst = rewriter.create<arith::ConstantIndexOp>(allocOp.getLoc(),
-                                                              originalShape[0]);
-    auto origYConst = rewriter.create<arith::ConstantIndexOp>(allocOp.getLoc(),
-                                                              originalShape[1]);
+    // Compute the offsets in SLM chunk for the current thread:
+    // X_off = (Xthr_i * Ybl_sz * Zbl_sz + Ythr_i * Zbl_sz + Zthr_i) * Xchunk_sz
+    // Offsets for other dimensions = 0
+    auto xI = getAffineDimExpr(0, rewriter.getContext());
+    auto yI = getAffineDimExpr(1, rewriter.getContext());
+    auto zI = getAffineDimExpr(2, rewriter.getContext());
+    auto idxExpr =
+        (xI * blockSizes[1] * blockSizes[2] + yI * blockSizes[2] + zI) *
+        originalShape[0];
+    auto idxMap = AffineMap::get(/*dimCount=*/3, /*symbolCount=*/0, idxExpr);
 
     auto threadIds = launchOp.getThreadIds();
+    auto offX = rewriter.create<affine::AffineApplyOp>(
+        allocOp.getLoc(), idxMap,
+        /*exprOperands=*/ValueRange({threadIds.x, threadIds.y, threadIds.z}));
 
-    auto offX =
-        rewriter
-            .create<arith::MulIOp>(allocOp.getLoc(), threadIds.x, origXConst)
-            .getResult();
-    auto offY =
-        rewriter
-            .create<arith::MulIOp>(allocOp.getLoc(), threadIds.y, origYConst)
-            .getResult();
+    SmallVector<int64_t> staticOffsets({ShapedType::kDynamic});
+    staticOffsets.insert(staticOffsets.end(), originalShape.size() - 1, 0);
 
-    auto offsets = getMixedValues({ShapedType::kDynamic, ShapedType::kDynamic},
-                                  {offX, offY}, rewriter);
+    auto offsets = getMixedValues(staticOffsets, {offX}, rewriter);
     auto sizes = getMixedValues(originalShape, {}, rewriter);
-    auto strides = getMixedValues({1, 1}, {}, rewriter);
+    auto strides = getMixedValues(SmallVector<int64_t>(originalShape.size(), 1),
+                                  {}, rewriter);
 
     auto newSlice =
         rewriter
diff --git a/lib/gc/Transforms/GPU/LinalgToXeGPU.cpp b/lib/gc/Transforms/GPU/LinalgToXeGPU.cpp
index 3f4cb514..344261d4 100644
--- a/lib/gc/Transforms/GPU/LinalgToXeGPU.cpp
+++ b/lib/gc/Transforms/GPU/LinalgToXeGPU.cpp
@@ -62,6 +62,28 @@ static Value createFullMask(PatternRewriter &rewriter, Location loc,
   return res.getResult();
 }
 
+// Extracts the offsets from a subview operation as values.
+// The difference from mlir::getMixedOffsets is that this function
+// returns the offsets as mlir::Values that can be used directly as arguments
+// to other mlir::Operations.
+static SmallVector<Value> extractOffsetsAsValues(PatternRewriter &rewriter,
+                                                 Location loc,
+                                                 memref::SubViewOp subview) {
+  SmallVector<Value> offsetValues;
+  auto staticOffsets = subview.getStaticOffsets();
+  auto dynamicOffsets = subview.getOffsets();
+  size_t dynIdx = 0;
+  for (size_t i = 0; i < staticOffsets.size(); i++) {
+    if (staticOffsets[i] == ShapedType::kDynamic)
+      offsetValues.push_back(dynamicOffsets[dynIdx++]);
+    else
+      offsetValues.push_back(
+          rewriter.create<arith::ConstantIndexOp>(loc, staticOffsets[i]));
+  }
+
+  return offsetValues;
+}
+
 // Max number of elements to load/store from SLM
 constexpr int64_t maxSLMTileSize = 32;
 
@@ -841,8 +863,11 @@ static SmallVector<Value> createSLMDescTiles(PatternRewriter &rewriter,
   // GPU kernel. We have to merge the subview offsets into the descriptor
   // offset.
   if (auto subView = dyn_cast<memref::SubViewOp>(src.getDefiningOp())) {
-    auto xIntOffs = subView.getOffsets()[0];
-    auto yIntOffs = subView.getOffsets()[1];
+    auto offsets = extractOffsetsAsValues(rewriter, loc, subView);
+    assert(offsets.size() == 2 && "Expected 2D subview offsets");
+
+    auto xIntOffs = offsets[0];
+    auto yIntOffs = offsets[1];
 
     // compute 'blockOffset' (beginning of the subview block in the original
     // flat memref)
diff --git a/test/mlir/test/gc/Transforms/GPU/allocs-to-slm-1d.mlir b/test/mlir/test/gc/Transforms/GPU/allocs-to-slm-1d.mlir
new file mode 100644
index 00000000..302267da
--- /dev/null
+++ b/test/mlir/test/gc/Transforms/GPU/allocs-to-slm-1d.mlir
@@ -0,0 +1,43 @@
+// RUN: gc-opt %s --allocs-to-slm | FileCheck %s
+
+// Compute thread offset for SLM: (Xthread_idx * Yblock_sz * Zblock_sz + Ythread_idx * Zblock_sz + Zthread_idx) * Xchunk_size
+// CHECK: #map = affine_map<(d0, d1, d2) -> ((d0 * 12 + d1 * 4 + d2) * 256)>
+
+func.func @entry() {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c3 = arith.constant 3 : index
+  %c4 = arith.constant 4 : index
+
+  // Memory space wasn't assigned as it's allocated outside of gpu.launch block
+  // CHECK: %[[NEW_MEMREF_0:.*]] = memref.alloc() : memref<256xf16>
+  %0 = memref.alloc() : memref<256xf16>
+  // Capture thread-id variables
+  // CHECK: gpu.launch blocks(%[[ARG0:.+]], %[[ARG1:.+]], %[[ARG2:.+]]) in (%[[ARG6:.+]] = %c2, %[[ARG7:.+]] = %c2, %[[ARG8:.+]] = %c1) threads
+  // CHECK-SAME: (%[[THREAD_X:.+]], %[[THREAD_Y:.+]], %[[THREAD_Z:.+]]) in
+  // CHECK-SAME: (%[[ARG9:.+]] = %c2, %[[ARG10:.+]] = %c3, %[[ARG11:.+]] = %c4) {
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c3, %sz_tz = %c4) {
+    // Memory space was changed as it's explicitly specified
+    // CHECK: %[[NEW_MEMREF_1:.*]] = memref.alloc() : memref<256xf16, 1>
+    %1 = memref.alloc() : memref<256xf16, 1>
+    // Added 'shared' memory space and allocated SLM for each thread (2 * 3 * 4 = 24; 24 * 256 = 6144)
+    // CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<6144xf16, 3>
+    // CHECK: %[[OFF_X:.*]] = affine.apply #map(%[[THREAD_X]], %[[THREAD_Y]], %[[THREAD_Z]])
+    // CHECK: %[[NEW_MEMREF_3:.*]] = memref.subview %[[NEW_MEMREF_2]][%[[OFF_X]]] [256] [1]
+    // CHECK-SAME: memref<6144xf16, 3> to memref<256xf16, strided<[1], offset: ?>, 3>
+    %2 = memref.alloc() : memref<256xf16>
+
+    // CHECK: linalg.add ins(%[[NEW_MEMREF_1]], %[[NEW_MEMREF_3]] :
+    // CHECK-SAME: memref<256xf16, 1>, memref<256xf16, strided<[1], offset: ?>, 3>) outs(%[[NEW_MEMREF_0]] : memref<256xf16>)
+    linalg.add ins(%1, %2 :memref<256xf16, 1>, memref<256xf16>) outs(%0 : memref<256xf16>)
+    // CHECK: memref.dealloc %[[NEW_MEMREF_1]] : memref<256xf16, 1>
+    // Verify that there are no deallocs for SLM
+    // CHECK-NOT: memref.dealloc %[[NEW_MEMREF_2]] .*
+    // CHECK-NOT: memref.dealloc %[[NEW_MEMREF_3]] .*
+    memref.dealloc %1 : memref<256xf16, 1>
+    memref.dealloc %2 : memref<256xf16>
+    gpu.terminator
+  }
+  return
+}
diff --git a/test/mlir/test/gc/Transforms/GPU/allocs-to-slm.mlir b/test/mlir/test/gc/Transforms/GPU/allocs-to-slm-2d.mlir
similarity index 63%
rename from test/mlir/test/gc/Transforms/GPU/allocs-to-slm.mlir
rename to test/mlir/test/gc/Transforms/GPU/allocs-to-slm-2d.mlir
index 2bcf44f3..b41fab1f 100644
--- a/test/mlir/test/gc/Transforms/GPU/allocs-to-slm.mlir
+++ b/test/mlir/test/gc/Transforms/GPU/allocs-to-slm-2d.mlir
@@ -1,8 +1,12 @@
 // RUN: gc-opt %s --allocs-to-slm | FileCheck %s
 
+// Compute thread offset for SLM: (Xthread_idx * Yblock_sz * Zblock_sz + Ythread_idx * Zblock_sz + Zthread_idx) * Xchunk_size
+// CHECK: #map = affine_map<(d0, d1, d2) -> ((d0 * 12 + d1 * 4 + d2) * 16)>
+
 func.func @entry() {
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
+  %c3 = arith.constant 3 : index
   %c4 = arith.constant 4 : index
 
   // Memory space wasn't assigned as it's allocated outside of gpu.launch block
@@ -10,23 +14,22 @@
   %0 = memref.alloc() : memref<16x32xf16>
   // Capture thread-id variables
   // CHECK: gpu.launch blocks(%[[ARG0:.+]], %[[ARG1:.+]], %[[ARG2:.+]]) in (%[[ARG6:.+]] = %c2, %[[ARG7:.+]] = %c2, %[[ARG8:.+]] = %c1) threads
-  // CHECK-SAME: (%[[THREAD_X:.+]], %[[THREAD_Y:.+]], %[[ARG5:.+]]) in
-  // CHECK-SAME: (%[[ARG9:.+]] = %c2, %[[ARG10:.+]] = %c4, %[[ARG11:.+]] = %c1) {
+  // CHECK-SAME: (%[[THREAD_X:.+]], %[[THREAD_Y:.+]], %[[THREAD_Z:.+]]) in
+  // CHECK-SAME: (%[[ARG9:.+]] = %c2, %[[ARG10:.+]] = %c3, %[[ARG11:.+]] = %c4) {
   gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c1)
-             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c4, %sz_tz = %c1) {
+             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c3, %sz_tz = %c4) {
     // Memory space was changed as it's explicitly specifided
     // CHECK: %[[NEW_MEMREF_1:.*]] = memref.alloc() : memref<16x32xf16, 1>
     %1 = memref.alloc() : memref<16x32xf16, 1>
-    // Added 'shared' memory space
-    // CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<32x128xf16, 3>
-    // CHECK: %[[OFF_X:.*]] = arith.muli %[[THREAD_X]], %c16 : index
-    // CHECK: %[[OFF_Y:.*]] = arith.muli %[[THREAD_Y]], %c32 : index
-    // CHECK: %[[NEW_MEMREF_3:.*]] = memref.subview %[[NEW_MEMREF_2]][%[[OFF_X]], %[[OFF_Y]]] [16, 32] [1, 1]
-    // CHECK-SAME: memref<32x128xf16, 3> to memref<16x32xf16, strided<[128, 1], offset: ?>, 3>
+    // Added 'shared' memory space and allocated SLM for each thread (2 * 3 * 4 = 24; 24 * 16 = 384)
+    // CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<384x32xf16, 3>
+    // CHECK: %[[OFF_X:.*]] = affine.apply #map(%[[THREAD_X]], %[[THREAD_Y]], %[[THREAD_Z]])
+    // CHECK: %[[NEW_MEMREF_3:.*]] = memref.subview %[[NEW_MEMREF_2]][%[[OFF_X]], 0] [16, 32] [1, 1]
+    // CHECK-SAME: memref<384x32xf16, 3> to memref<16x32xf16, strided<[32, 1], offset: ?>, 3>
     %2 = memref.alloc() : memref<16x32xf16>
 
     // CHECK: linalg.add ins(%[[NEW_MEMREF_1]], %[[NEW_MEMREF_3]] :
-    // CHECK-SAME: memref<16x32xf16, 1>, memref<16x32xf16, strided<[128, 1], offset: ?>, 3>) outs(%[[NEW_MEMREF_0]] : memref<16x32xf16>)
+    // CHECK-SAME: memref<16x32xf16, 1>, memref<16x32xf16, strided<[32, 1], offset: ?>, 3>) outs(%[[NEW_MEMREF_0]] : memref<16x32xf16>)
     linalg.add ins(%1, %2 :memref<16x32xf16, 1>, memref<16x32xf16>) outs(%0 : memref<16x32xf16>)
     // CHECK: memref.dealloc %[[NEW_MEMREF_1]] : memref<16x32xf16, 1>
     // Verify that there are no deallocs for SLM
diff --git a/test/mlir/test/gc/Transforms/GPU/allocs-to-slm-4d.mlir b/test/mlir/test/gc/Transforms/GPU/allocs-to-slm-4d.mlir
new file mode 100644
index 00000000..01659795
--- /dev/null
+++ b/test/mlir/test/gc/Transforms/GPU/allocs-to-slm-4d.mlir
@@ -0,0 +1,43 @@
+// RUN: gc-opt %s --allocs-to-slm | FileCheck %s
+
+// Compute thread offset for SLM: (Xthread_idx * Yblock_sz * Zblock_sz + Ythread_idx * Zblock_sz + Zthread_idx) * Xchunk_size
+// CHECK: #map = affine_map<(d0, d1, d2) -> ((d0 * 12 + d1 * 4 + d2) * 2)>
+
+func.func @entry() {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c3 = arith.constant 3 : index
+  %c4 = arith.constant 4 : index
+
+  // Memory space wasn't assigned as it's allocated outside of gpu.launch block
+  // CHECK: %[[NEW_MEMREF_0:.*]] = memref.alloc() : memref<2x3x16x32xf16>
+  %0 = memref.alloc() : memref<2x3x16x32xf16>
+  // Capture thread-id variables
+  // CHECK: gpu.launch blocks(%[[ARG0:.+]], %[[ARG1:.+]], %[[ARG2:.+]]) in (%[[ARG6:.+]] = %c2, %[[ARG7:.+]] = %c2, %[[ARG8:.+]] = %c1) threads
+  // CHECK-SAME: (%[[THREAD_X:.+]], %[[THREAD_Y:.+]], %[[THREAD_Z:.+]]) in
+  // CHECK-SAME: (%[[ARG9:.+]] = %c2, %[[ARG10:.+]] = %c3, %[[ARG11:.+]] = %c4) {
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c3, %sz_tz = %c4) {
+    // Memory space was changed as it's explicitly specified
+    // CHECK: %[[NEW_MEMREF_1:.*]] = memref.alloc() : memref<2x3x16x32xf16, 1>
+    %1 = memref.alloc() : memref<2x3x16x32xf16, 1>
+    // Added 'shared' memory space and allocated SLM for each thread (2 * 3 * 4 = 24; 24 * 2 = 48)
+    // CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<48x3x16x32xf16, 3>
+    // CHECK: %[[OFF_X:.*]] = affine.apply #map(%[[THREAD_X]], %[[THREAD_Y]], %[[THREAD_Z]])
+    // CHECK: %[[NEW_MEMREF_3:.*]] = memref.subview %[[NEW_MEMREF_2]][%[[OFF_X]], 0, 0, 0] [2, 3, 16, 32] [1, 1, 1, 1]
+    // CHECK-SAME: memref<48x3x16x32xf16, 3> to memref<2x3x16x32xf16, strided<[1536, 512, 32, 1], offset: ?>, 3>
+    %2 = memref.alloc() : memref<2x3x16x32xf16>
+
+    // CHECK: linalg.add ins(%[[NEW_MEMREF_1]], %[[NEW_MEMREF_3]] :
+    // CHECK-SAME: memref<2x3x16x32xf16, 1>, memref<2x3x16x32xf16, strided<[1536, 512, 32, 1], offset: ?>, 3>) outs(%[[NEW_MEMREF_0]] : memref<2x3x16x32xf16>)
+    linalg.add ins(%1, %2 :memref<2x3x16x32xf16, 1>, memref<2x3x16x32xf16>) outs(%0 : memref<2x3x16x32xf16>)
+    // CHECK: memref.dealloc %[[NEW_MEMREF_1]] : memref<2x3x16x32xf16, 1>
+    // Verify that there are no deallocs for SLM
+    // CHECK-NOT: memref.dealloc %[[NEW_MEMREF_2]] .*
+    // CHECK-NOT: memref.dealloc %[[NEW_MEMREF_3]] .*
+    memref.dealloc %1 : memref<2x3x16x32xf16, 1>
+    memref.dealloc %2 : memref<2x3x16x32xf16>
+    gpu.terminator
+  }
+  return
+}
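
The affine map introduced by this patch linearizes the 3-D thread id into a single X offset, (x * Ybl_sz * Zbl_sz + y * Zbl_sz + z) * Xchunk_sz, so every thread gets a disjoint chunk of the enlarged SLM buffer. A minimal standalone C++ sketch of that arithmetic (not part of the patch), using the block sizes (2, 3, 4) and the 256-element chunk from allocs-to-slm-1d.mlir:

// Sketch of the per-thread SLM offset encoded by
// #map = affine_map<(d0, d1, d2) -> ((d0 * 12 + d1 * 4 + d2) * 256)>.
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t blockSizes[3] = {2, 3, 4}; // threads(%tx, %ty, %tz) in the tests
  const int64_t chunk = 256;               // originalShape[0] in the 1d test
  const int64_t slmSize =
      blockSizes[0] * blockSizes[1] * blockSizes[2] * chunk; // 24 * 256 = 6144

  for (int64_t x = 0; x < blockSizes[0]; ++x)
    for (int64_t y = 0; y < blockSizes[1]; ++y)
      for (int64_t z = 0; z < blockSizes[2]; ++z) {
        int64_t off =
            (x * blockSizes[1] * blockSizes[2] + y * blockSizes[2] + z) * chunk;
        // Each thread's chunk must stay inside the shared buffer.
        assert(off + chunk <= slmSize);
        std::printf("thread (%lld, %lld, %lld) -> offset %lld\n",
                    (long long)x, (long long)y, (long long)z, (long long)off);
      }
  return 0;
}

For example, the last thread (1, 2, 3) lands at (1 * 12 + 2 * 4 + 3) * 256 = 5888, and 5888 + 256 = 6144 matches the memref<6144xf16, 3> allocation checked in the test.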
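
The extractOffsetsAsValues helper is needed because memref.subview keeps its offsets in two parallel lists: a static list where ShapedType::kDynamic marks a placeholder, and a dynamic operand list holding the Values for exactly those placeholders, in order. The old code read getOffsets()[0] and [1] directly, which only works when both offsets are dynamic. A standalone sketch of that interleaving contract on plain integers (the sentinel and names here are illustrative, not the MLIR API):

// Resolve an interleaved static/dynamic offset list; plain int64_t stands in
// for mlir::Value, and kDynamicSentinel stands in for ShapedType::kDynamic.
#include <cassert>
#include <cstdint>
#include <vector>

constexpr int64_t kDynamicSentinel = INT64_MIN;

std::vector<int64_t> resolveOffsets(const std::vector<int64_t> &staticOffsets,
                                    const std::vector<int64_t> &dynamicOffsets) {
  std::vector<int64_t> resolved;
  size_t dynIdx = 0;
  for (int64_t s : staticOffsets)
    resolved.push_back(s == kDynamicSentinel ? dynamicOffsets[dynIdx++] : s);
  assert(dynIdx == dynamicOffsets.size() && "every dynamic operand consumed");
  return resolved;
}

int main() {
  // Shape of a subview such as %s[%off, 8]: one dynamic and one static offset.
  auto offsets = resolveOffsets({kDynamicSentinel, 8}, {5});
  assert(offsets[0] == 5 && offsets[1] == 8);
  return 0;
}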