|
8 | 8 |
|
9 | 9 | #include "mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.h" |
10 | 10 | #include "mlir/Dialect/SCF/IR/SCF.h" |
| 11 | +#include "mlir/Dialect/SCF/Utils/Utils.h" |
11 | 12 | #include "mlir/Dialect/XeGPU/IR/XeGPU.h" |
12 | 13 | #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" |
13 | 14 |
|
@@ -341,6 +342,143 @@ void transform::SetOpLayoutAttrOp::getEffects( |
341 | 342 | modifiesPayload(effects); |
342 | 343 | } |
343 | 344 |
|
| 345 | +DiagnosedSilenceableFailure |
| 346 | +transform::InsertPrefetchOp::apply(transform::TransformRewriter &rewriter, |
| 347 | + transform::TransformResults &results, |
| 348 | + transform::TransformState &state) { |
| 349 | + auto targetValues = state.getPayloadValues(getTarget()); |
| 350 | + if (!llvm::hasSingleElement(targetValues)) { |
| 351 | + return emitDefiniteFailure() |
| 352 | + << "requires exactly one target value handle (got " |
| 353 | + << llvm::range_size(targetValues) << ")"; |
| 354 | + } |
| 355 | + auto value = *targetValues.begin(); |
| 356 | + |
| 357 | + int64_t nbPrefetch = getStaticNbPrefetch(); |
| 358 | + if (getDynamicNbPrefetch()) { |
| 359 | + // Get dynamic prefetch count from transform param or handle. |
| 360 | + SmallVector<int32_t> dynamicNbPrefetch; |
| 361 | + auto status = convertMixedValuesToInt(state, (*this), dynamicNbPrefetch, |
| 362 | + {getDynamicNbPrefetch()}); |
| 363 | + if (!status.succeeded()) |
| 364 | + return status; |
| 365 | + if (dynamicNbPrefetch.size() != 1) { |
| 366 | + return emitDefiniteFailure() |
| 367 | + << "requires exactly one value for dynamic_nb_prefetch"; |
| 368 | + } |
| 369 | + nbPrefetch = dynamicNbPrefetch[0]; |
| 370 | + } |
| 371 | + if (nbPrefetch <= 0) { |
| 372 | + return emitSilenceableFailure(getLoc()) |
| 373 | + << "nb_prefetch must be a positive integer."; |
| 374 | + } |
| 375 | + |
| 376 | + // Find load operation of the operand. |
| 377 | + auto maybeLoadOp = findProducerOfType<xegpu::LoadNdOp>(value); |
| 378 | + if (!maybeLoadOp) { |
| 379 | + return emitSilenceableFailure(getLoc()) << "Could not find load op."; |
| 380 | + } |
| 381 | + auto loadOp = *maybeLoadOp; |
| 382 | + if (loadOp.getMixedOffsets().size() == 0) { |
| 383 | + auto diag = emitSilenceableFailure(getLoc()) |
| 384 | + << "Load op must have offsets."; |
| 385 | + diag.attachNote(loadOp.getLoc()) << "load op"; |
| 386 | + return diag; |
| 387 | + } |
| 388 | + |
| 389 | + // Find the parent scf.for loop. |
| 390 | + auto forOp = loadOp->getParentOfType<scf::ForOp>(); |
| 391 | + if (!forOp) { |
| 392 | + auto diag = emitSilenceableFailure(getLoc()) |
| 393 | + << "Load op is not contained in a scf.for loop."; |
| 394 | + diag.attachNote(loadOp.getLoc()) << "load op"; |
| 395 | + return diag; |
| 396 | + } |
| 397 | + |
| 398 | + // Find descriptor op. |
| 399 | + auto maybeDescOp = findProducerOfType<xegpu::CreateNdDescOp>(value); |
| 400 | + if (!maybeDescOp) { |
| 401 | + return emitSilenceableFailure(getLoc()) << "Could not find descriptor op."; |
| 402 | + } |
| 403 | + auto descOp = *maybeDescOp; |
| 404 | + if (descOp.getMixedOffsets().size() > 0) { |
| 405 | + auto diag = emitSilenceableFailure(getLoc()) |
| 406 | + << "desc op with offsets is not supported."; |
| 407 | + diag.attachNote(descOp.getLoc()) << "desc op"; |
| 408 | + } |
| 409 | + |
| 410 | + // Clone desc op outside the loop. |
| 411 | + rewriter.setInsertionPoint(forOp); |
| 412 | + auto newDescOp = |
| 413 | + cast<xegpu::CreateNdDescOp>(rewriter.clone(*descOp.getOperation())); |
| 414 | + |
| 415 | + // Clone reduction loop to emit initial prefetches. |
| 416 | + // Compute upper bound of the init loop: start + nbPrefetch * step. |
| 417 | + auto nbPrefetchCst = |
| 418 | + arith::ConstantIndexOp::create(rewriter, forOp.getLoc(), nbPrefetch); |
| 419 | + auto nbStep = rewriter.createOrFold<arith::MulIOp>( |
| 420 | + forOp.getLoc(), nbPrefetchCst, forOp.getStep()); |
| 421 | + auto initUpBound = rewriter.createOrFold<arith::AddIOp>( |
| 422 | + forOp.getLoc(), forOp.getLowerBound(), nbStep); |
| 423 | + auto initForOp = |
| 424 | + scf::ForOp::create(rewriter, forOp.getLoc(), forOp.getLowerBound(), |
| 425 | + initUpBound, forOp.getStep()); |
| 426 | + |
| 427 | + auto ctx = rewriter.getContext(); |
| 428 | + auto readCacheHint = |
| 429 | + xegpu::CachePolicyAttr::get(ctx, xegpu::CachePolicy::CACHED); |
| 430 | + |
| 431 | + // Modify loadOp mixedOffsets by replacing the for loop induction variable |
| 432 | + // with the given value. |
| 433 | + auto getPrefetchOffsets = |
| 434 | + [&](Value replacementVal) -> SmallVector<OpFoldResult> { |
| 435 | + IRMapping mapping; |
| 436 | + mapping.map(forOp.getInductionVar(), replacementVal); |
| 437 | + SmallVector<Value> dynamicOffsets = |
| 438 | + llvm::to_vector(llvm::map_range(loadOp.getOffsets(), [&](Value v) { |
| 439 | + return mapping.lookupOrDefault(v); |
| 440 | + })); |
| 441 | + auto constOffsets = loadOp.getConstOffsets().value(); |
| 442 | + return getMixedValues(constOffsets, dynamicOffsets, ctx); |
| 443 | + }; |
| 444 | + |
| 445 | + // Insert prefetch op in init loop. |
| 446 | + // Replace induction var with the init loop induction var. |
| 447 | + rewriter.setInsertionPointToStart(initForOp.getBody()); |
| 448 | + xegpu::PrefetchNdOp::create(rewriter, newDescOp.getLoc(), |
| 449 | + newDescOp.getResult(), |
| 450 | + getPrefetchOffsets(initForOp.getInductionVar()), |
| 451 | + readCacheHint, readCacheHint, readCacheHint); |
| 452 | + |
| 453 | + // Insert prefetch op in main loop. |
| 454 | + // Calculate prefetch offset after the init prefetches have been issued. |
| 455 | + rewriter.setInsertionPointToStart(forOp.getBody()); |
| 456 | + auto prefetchOffset = arith::AddIOp::create(rewriter, forOp.getLoc(), |
| 457 | + forOp.getInductionVar(), nbStep); |
| 458 | + // Replace induction var with correct offset. |
| 459 | + xegpu::PrefetchNdOp::create(rewriter, newDescOp.getLoc(), |
| 460 | + newDescOp.getResult(), |
| 461 | + getPrefetchOffsets(prefetchOffset), readCacheHint, |
| 462 | + readCacheHint, readCacheHint); |
| 463 | + |
| 464 | + // Unroll the init loop. |
| 465 | + if (failed(loopUnrollFull(initForOp))) { |
| 466 | + return emitSilenceableFailure(getLoc()) << "Failed to unroll the loop"; |
| 467 | + } |
| 468 | + |
| 469 | + results.set(llvm::cast<OpResult>(getResult()), {newDescOp}); |
| 470 | + |
| 471 | + return DiagnosedSilenceableFailure::success(); |
| 472 | +} |
| 473 | + |
/// Declares the transform's memory effects for the transform interpreter:
/// both operand handles are only read (neither is consumed/invalidated),
/// the op's results are freshly produced handles, and applying the
/// transform mutates the payload IR.
void transform::InsertPrefetchOp::getEffects(
    ::llvm::SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
  // Target value handle and the optional dynamic prefetch-count operand are
  // read-only; they remain valid after this transform is applied.
  onlyReadsHandle(getTargetMutable(), effects);
  onlyReadsHandle(getDynamicNbPrefetchMutable(), effects);
  // The result handle (hoisted descriptor op) is newly allocated here.
  producesHandle(getOperation()->getOpResults(), effects);
  // Prefetch ops and the init loop are inserted into the payload IR.
  modifiesPayload(effects);
}
| 481 | + |
344 | 482 | namespace { |
345 | 483 | class XeGPUTransformDialectExtension |
346 | 484 | : public transform::TransformDialectExtension< |
|
0 commit comments