
Commit faf553a

lialan authored and Your Name committed
[GPU] Add in_bounds attribute to CoalescedGatherDMAOp for tensor.pad fusion
Add support for fusing tensor.pad into coalesced_gather_dma when the copy source is a padded tensor. This enables DMA operations to read directly from global memory (fat_raw_buffer) instead of creating private memory allocations for padded data.

Key changes:
* Add optional in_bounds attribute to CoalescedGatherDMAOp (per-dim bool array)
* Update verifier to allow source/init shape mismatches when in_bounds[dim]=false
* Modify GPUConvertToCoalescedDMA to trace through tensor.pad and extract_slice
* Compute in_bounds based on padding: true if no padding, false if OOB allowed

Constraints:
* Low padding must be [0, 0] (no low padding)
* Padding value must be constant 0.0 (matches AMD hardware OOB behavior)

AMD fat_raw_buffer with boundsCheck=true returns 0 for out-of-bounds reads, providing hardware-level padding semantics without explicit software masking.
1 parent b1d73d0 commit faf553a
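To make the in_bounds rule concrete, here is a minimal standalone sketch of the intended semantics, using plain integers and a double in place of the MLIR OpFoldResult/attribute machinery. The helper name and signature are illustrative only, not part of this change.

#include <cstdint>
#include <optional>
#include <vector>

// Illustrative helper (not from this commit): per-dimension in_bounds is true
// only when that dimension has no padding, and the fusion is only legal when
// low padding is zero everywhere and the padding value is a constant 0, so
// hardware OOB zero-fill reproduces the tensor.pad semantics.
static std::optional<std::vector<bool>>
computeInBoundsFlags(const std::vector<int64_t> &lowPad,
                     const std::vector<int64_t> &highPad, double padValue) {
  if (padValue != 0.0)
    return std::nullopt; // Pad value must be constant 0.
  std::vector<bool> inBounds;
  for (size_t dim = 0; dim < lowPad.size(); ++dim) {
    if (lowPad[dim] != 0)
      return std::nullopt; // Low padding is not supported.
    inBounds.push_back(highPad[dim] == 0); // false => OOB reads may occur.
  }
  return inBounds;
}

For example, lowPad = {0, 0} with highPad = {0, 3} would yield in_bounds = [true, false], so only the innermost dimension relies on the hardware zero-fill.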


11 files changed: +689 −39 lines changed


compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp

Lines changed: 179 additions & 11 deletions
@@ -23,6 +23,7 @@
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Matchers.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -58,6 +59,15 @@ static SmallVector<Attribute> getThreadMapping(MLIRContext *ctx) {
   return mapping;
 }
 
+/// Trace through extract_slice operations to find an underlying tensor.pad.
+/// Returns the PadOp if found, nullptr otherwise.
+static tensor::PadOp traceToTensorPad(Value source) {
+  while (auto extractSlice = source.getDefiningOp<tensor::ExtractSliceOp>()) {
+    source = extractSlice.getSource();
+  }
+  return source.getDefiningOp<tensor::PadOp>();
+}
+
 /// Check if a value traces back to tensor.empty (possibly through forall args).
 static bool tracesToTensorEmpty(Value value) {
   // Direct tensor.empty.
@@ -300,14 +310,74 @@ static LogicalResult createDMAInForall(scf::ForallOp threadForallOp,
 
   Location loc = innerOp.getLoc();
   Value source, indices;
+  SmallVector<bool> inBoundsVec;
 
   // Extract source and indices based on op type.
   if constexpr (std::is_same_v<OpTy, linalg::CopyOp>) {
     Value input = innerOp.getInputs()[0];
-    if (auto extractSlice = input.getDefiningOp<tensor::ExtractSliceOp>()) {
-      source = extractSlice.getSource();
-    } else {
-      return failure();
+
+    // After tiling, the input is typically:
+    //   tensor.extract_slice %padded[...] [...] [1, 1]
+    // We need to trace through extract_slice to find if source is tensor.pad.
+    if (auto pad = traceToTensorPad(input)) {
+      // Verify pad constraints: low padding must be all zeros, pad value must
+      // be 0.
+      bool validPad = true;
+      for (OpFoldResult low : pad.getMixedLowPad()) {
+        if (!isConstantIntValue(low, 0)) {
+          validPad = false;
+          break;
+        }
+      }
+      Value padVal = pad.getConstantPaddingValue();
+      if (!padVal || !(matchPattern(padVal, m_AnyZeroFloat()) ||
+                       matchPattern(padVal, m_Zero()))) {
+        validPad = false;
+      }
+
+      if (validPad) {
+        // Use pad.getSource() directly as the DMA source.
+        // This is the tensor.extract_slice result (e.g., tensor<?x64xf32>).
+        source = pad.getSource();
+
+        // Check if source tensor's innermost row size is DWORD (4-byte)
+        // aligned. On AMD CDNA, per-component range checking is performed for
+        // each DWORD. If a DWORD is partially out-of-bounds, the entire DWORD
+        // returns zero, causing incorrect results. Additionally, partial OOB
+        // triggers the slow path with multi-cycling and instruction issue
+        // penalties.
+        auto sourceType = cast<RankedTensorType>(source.getType());
+        int64_t innermostDim = sourceType.getShape().back();
+        if (!ShapedType::isDynamic(innermostDim)) {
+          Type elemType = sourceType.getElementType();
+          int64_t elemBytes = elemType.getIntOrFloatBitWidth() / 8;
+          int64_t rowBytes = innermostDim * elemBytes;
+          if (rowBytes % 4 != 0) {
+            LLVM_DEBUG(llvm::dbgs()
+                       << "Skipping DMA: row size " << rowBytes
+                       << " bytes not DWORD-aligned (slow path)\n");
+            return failure();
+          }
+        }
+
+        // Compute in_bounds based on whether padding was added per dimension.
+        for (auto [low, high] :
+             llvm::zip(pad.getMixedLowPad(), pad.getMixedHighPad())) {
+          bool isInBounds =
+              isConstantIntValue(low, 0) && isConstantIntValue(high, 0);
+          inBoundsVec.push_back(isInBounds);
+        }
+      }
+    }
+
+    // Fallback: original behavior without tensor.pad fusion.
+    // Only trace through ONE level of extract_slice (the immediate input).
+    if (!source) {
+      if (auto extractSlice = input.getDefiningOp<tensor::ExtractSliceOp>()) {
+        source = extractSlice.getSource();
+      } else {
+        return failure();
+      }
     }
   } else if constexpr (std::is_same_v<OpTy, IREE::LinalgExt::GatherOp>) {
     source = innerOp.getSource();
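As a side note on the DWORD constraint in the hunk above, the same guard can be restated in isolation. This is a standalone sketch with assumed names, not the compiler code:

#include <cstdint>

// Restates the alignment guard above. With f16 (2-byte) elements, a
// 31-element innermost row is 62 bytes: the last valid element can share a
// DWORD with out-of-bounds bytes, so hardware bounds checking could zero the
// whole DWORD and clobber a valid value (and hit the slow path). Rows that
// are a whole number of DWORDs avoid this; anything else makes the pattern
// bail out to the non-DMA path.
static bool rowIsDwordAligned(int64_t innermostDimSize, int64_t elemBitWidth) {
  const int64_t rowBytes = innermostDimSize * (elemBitWidth / 8);
  return rowBytes % 4 == 0;
}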
@@ -356,15 +426,22 @@ static LogicalResult createDMAInForall(scf::ForallOp threadForallOp,
 
   // Create the DMA op in the in_parallel region.
   rewriter.setInsertionPointToStart(&inParallelBlock);
-  SmallVector<Value, 1> indicesVec;
+  SmallVector<Value, 1> indicesOperands;
   if (indices) {
-    indicesVec.push_back(indices);
+    indicesOperands.push_back(indices);
+  }
+
+  // Create in_bounds attribute if we fused a tensor.pad.
+  ArrayAttr inBoundsAttr;
+  if (!inBoundsVec.empty()) {
+    inBoundsAttr = rewriter.getBoolArrayAttr(inBoundsVec);
   }
 
   // When used in forall.in_parallel, the op doesn't return a result
   // as it performs an in-place update to the shared_outs tensor.
   IREE::GPU::CoalescedGatherDMAOp::create(rewriter, loc, Type(), source,
-                                          indicesVec, sharedOut, laneId);
+                                          indicesOperands, sharedOut, laneId,
+                                          inBoundsAttr);
 
   // Erase the parallel_insert_slice ops and inner operation.
   for (tensor::ParallelInsertSliceOp &insertOp : toErase) {
421498
}
422499
};
423500

501+
/// Pattern to convert tensor.pad fusion cases directly without requiring
502+
/// warp-mapped forall parent.
503+
struct ConvertPadFusionCopyToCoalescedDMA
504+
: public OpRewritePattern<linalg::CopyOp> {
505+
using OpRewritePattern::OpRewritePattern;
506+
507+
LogicalResult matchAndRewrite(linalg::CopyOp copyOp,
508+
PatternRewriter &rewriter) const override {
509+
// Only match copies with use_global_load_dma config
510+
auto config = getLoweringConfig<IREE::GPU::UseGlobalLoadDMAAttr>(copyOp);
511+
if (!config) {
512+
return failure();
513+
}
514+
515+
// Check if this is a tensor.pad fusion case
516+
auto pad = traceToTensorPad(copyOp.getInputs()[0]);
517+
if (!pad) {
518+
return failure(); // Not a pad fusion case
519+
}
520+
521+
// Check if padding exists (non-zero low/high pad)
522+
bool hasPadding = false;
523+
for (auto [low, high] :
524+
llvm::zip(pad.getMixedLowPad(), pad.getMixedHighPad())) {
525+
if (!isConstantIntValue(low, 0) || !isConstantIntValue(high, 0)) {
526+
hasPadding = true;
527+
break;
528+
}
529+
}
530+
if (!hasPadding) {
531+
return failure(); // No actual padding
532+
}
533+
534+
// This is a tensor.pad fusion case. Convert directly to
535+
// coalesced_gather_dma without requiring warp-mapped forall.
536+
auto outputType = cast<RankedTensorType>(copyOp.getOutputs()[0].getType());
537+
SmallVector<OpFoldResult> threadNumThreads =
538+
computeThreadNumThreadsImpl(rewriter, copyOp, outputType);
539+
if (threadNumThreads.empty()) {
540+
return failure();
541+
}
542+
543+
scf::ForallOp threadForallOp =
544+
tileToThreadLevel(copyOp, rewriter, threadNumThreads);
545+
if (!threadForallOp) {
546+
return failure();
547+
}
548+
549+
return createDMAInForall<linalg::CopyOp>(threadForallOp, rewriter);
550+
}
551+
};
552+
424553
struct ConvertGatherToCoalescedDMA
425554
: public OpRewritePattern<IREE::LinalgExt::GatherOp> {
426555
using OpRewritePattern<IREE::LinalgExt::GatherOp>::OpRewritePattern;
@@ -574,7 +703,8 @@ struct ConvertGatherToCoalescedDMA
     rewriter.setInsertionPointToStart(&inParallelBlock);
 
     IREE::GPU::CoalescedGatherDMAOp::create(rewriter, loc, Type(), source,
-                                            indicesVec, sharedOut, laneId);
+                                            indicesVec, sharedOut, laneId,
+                                            /*in_bounds=*/nullptr);
 
     // Erase parallel_insert_slice ops and gather op.
     SmallVector<tensor::ParallelInsertSliceOp> toErase;
@@ -605,9 +735,11 @@ struct GPUConvertToCoalescedDMAPass final
    }
 
    // Only tile and convert ops within forall ops with warp mapping.
+   // Also handle tensor.pad fusion cases that don't have warp mapping.
    RewritePatternSet patterns(context);
    patterns.add<ConvertGatherToCoalescedDMA>(context);
    patterns.add<ConvertCopyToCoalescedDMA>(context);
+   patterns.add<ConvertPadFusionCopyToCoalescedDMA>(context);
 
    walkAndApplyPatterns(funcOp, std::move(patterns));
   }
@@ -758,9 +890,42 @@ struct GPUConvertToCoalescedDMAPass final
      return failure();
    }
 
-   // Compute tile sizes for subgroup-level distribution.
-   auto [tileSizes, numTiledDims] =
-       computeSubgroupTileSizes(rewriter, shape, numWarps);
+   // Check if this is a tensor.pad fusion case.
+   bool isPadFusion = false;
+   if (auto copyOp = dyn_cast<linalg::CopyOp>(op.getOperation())) {
+     if (auto pad = traceToTensorPad(copyOp.getInputs()[0])) {
+       // Check if padding exists (non-zero low/high pad).
+       for (auto [low, high] :
+            llvm::zip(pad.getMixedLowPad(), pad.getMixedHighPad())) {
+         if (!isConstantIntValue(low, 0) || !isConstantIntValue(high, 0)) {
+           isPadFusion = true;
+           break;
+         }
+       }
+     }
+   }
+
+   SmallVector<OpFoldResult> tileSizes;
+   int64_t numTiledDims = 0;
+
+   if (isPadFusion) {
+     // For tensor.pad fusion, create a single-iteration wrapper forall
+     // by setting tile sizes to the full shape. This allows the DMA to
+     // operate on the full buffer while satisfying the warp-mapped parent
+     // requirement.
+     // Bail out if any dimension is dynamic since we need static tile sizes.
+     if (llvm::any_of(shape, ShapedType::isDynamic)) {
+       return failure();
+     }
+     for (int64_t i = 0; i < rank; ++i) {
+       tileSizes.push_back(rewriter.getIndexAttr(shape[i]));
+       ++numTiledDims;
+     }
+   } else {
+     // Compute tile sizes for subgroup-level distribution.
+     std::tie(tileSizes, numTiledDims) =
+         computeSubgroupTileSizes(rewriter, shape, numWarps);
+   }
 
    if (numTiledDims == 0) {
      return failure();
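The full-shape tile sizes above are what make the wrapper forall a single iteration; a small standalone sketch of that arithmetic follows (assumed names, independent of the tiling interface):

#include <cstdint>
#include <vector>

// With tile sizes equal to the full static shape, ceilDiv(shape[d], tile[d])
// is 1 for every dimension, so the warp-mapped wrapper forall runs exactly
// once and the DMA inside it still covers the whole buffer.
static int64_t countForallIterations(const std::vector<int64_t> &shape,
                                     const std::vector<int64_t> &tileSizes) {
  int64_t iterations = 1;
  for (size_t d = 0; d < shape.size(); ++d)
    iterations *= (shape[d] + tileSizes[d] - 1) / tileSizes[d]; // ceilDiv
  return iterations;
}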
@@ -798,6 +963,9 @@ struct GPUConvertToCoalescedDMAPass final
    });
 
    // Apply subgroup-level tiling to each op.
+   // For tensor.pad fusion cases, tileAtSubgroupLevel creates a
+   // single-iteration wrapper forall to maintain the expected structure while
+   // allowing the DMA to operate on the full buffer.
    IRRewriter rewriter(context);
    for (Operation *op : opsToTile) {
      FailureOr<scf::SCFTilingResult> tilingResult =

compiler/src/iree/compiler/Codegen/Common/GPU/GPUInferMemorySpace.cpp

Lines changed: 2 additions & 1 deletion
@@ -47,7 +47,8 @@ bool isDefinitelyShared(bufferization::AllocTensorOp alloc) {
     auto forallOp = dyn_cast<scf::ForallOp>(user);
     if (!forallOp ||
         !forallOpHasMappingType<gpu::GPUThreadMappingAttr,
-                                gpu::GPUWarpMappingAttr>(forallOp)) {
+                                gpu::GPUWarpMappingAttr, IREE::GPU::LaneIdAttr>(
+            forallOp)) {
       return false;
     }
   }
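For readers unfamiliar with the helper being extended here, a mapping-type check over an scf.forall can be sketched roughly as below. This is an illustrative approximation, not IREE's forallOpHasMappingType; the include paths and getter behavior are assumptions.

#include <optional>
#include "llvm/ADT/STLExtras.h"
#include "mlir/Dialect/SCF/IR/SCF.h"

// Rough approximation of a forallOpHasMappingType-style check: returns true
// if any entry of the forall's mapping attribute is one of the requested
// attribute kinds (e.g. thread, warp, or, after this change, lane id).
template <typename... MappingTypes>
static bool hasAnyMappingType(mlir::scf::ForallOp forallOp) {
  std::optional<mlir::ArrayAttr> mapping = forallOp.getMapping();
  if (!mapping)
    return false;
  return llvm::any_of(*mapping, [](mlir::Attribute attr) {
    return llvm::isa<MappingTypes...>(attr);
  });
}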
