|
13 | 13 | #define GEN_PASS_CLASSES |
14 | 14 | #include "TritonAMDGPUTransforms/Passes.h" |
15 | 15 |
|
// Debug-logging helpers for this file.
// DEBUG_TYPE is the tag string for this file's debug output. It is pasted into
// the DBGS() prefix below; it is also the name LLVM_DEBUG is documented to
// consult (see the LLVM Programmer's Manual).
| 16 | +#define DEBUG_TYPE "tritonamdgpu-block-pingpong" |
// DBGS() evaluates to llvm::dbgs() after writing the bracketed DEBUG_TYPE tag
// followed by a colon and a space, so further << output continues that line.
| 17 | +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") |
// LDBG(X) streams X after the DBGS() prefix, appends a newline, and wraps the
// whole statement in LLVM_DEBUG. X is pasted unparenthesized into the <<
// chain, so callers may pass either a single value or a << chain of values.
| 18 | +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") |
| 19 | + |
16 | 20 | using namespace mlir; |
17 | 21 | namespace ttg = mlir::triton::gpu; |
18 | 22 | namespace tt = mlir::triton; |
@@ -447,9 +451,14 @@ void Pingponger::getDotPingponged() { |
447 | 451 | // software pipelining and dot rank=2. Also only accept the for-loop with |
448 | 452 | // supported combination of operations because this transformation is very |
449 | 453 | // tightly scheduling the latencies. |
450 | | - if (gLoadOps.size() < 2 || lLoadOps.size() < 2 || dotOps.size() != 1) |
| 454 | + if (gLoadOps.size() < 2 || lLoadOps.size() < 2 || dotOps.size() != 1) { |
| 455 | + std::stringstream message; |
| 456 | + message << "Unable to match ping pong scheduling pattern. Details: " |
| 457 | + << gLoadOps.size() << " global loads, " << lLoadOps.size() |
| 458 | + << " local loads, " << dotOps.size() << " dot products"; |
| 459 | + LDBG(message.str()); |
451 | 460 | return; |
452 | | - |
| 461 | + } |
453 | 462 | // Pingpong scheduling tries to form two different types of the instruction |
454 | 463 | // clusters, i.e., Dot clusters and Memory clusters. While each SIMD has |
455 | 464 | // two concurrent warps, both warps can execute a different type of |
@@ -509,18 +518,32 @@ void Pingponger::getDotPingponged() { |
509 | 518 | // numWarps=4 doesn't need asymmetric sync, return. |
510 | 519 | return; |
511 | 520 | } else if (numWarps == 8) { // Pingpong between warps from the same block |
512 | | - if (gLoadOps.size() != 2 || lLoadOps.size() != 2) |
| 521 | + if (gLoadOps.size() != 2 || lLoadOps.size() != 2) { |
| 522 | + std::stringstream message; |
| 523 | + message << "Unable to match ping pong slicing pattern. Details: " |
| 524 | + << gLoadOps.size() << " global loads, " << lLoadOps.size() |
| 525 | + << " local loads"; |
| 526 | + LDBG(message.str()); |
513 | 527 | return; |
| 528 | + } |
514 | 529 | // Transform a loop where the tile size requires dots to be sliced |
515 | 530 | if (tileSize == mediumTile) { |
516 | | - if (transformTwoPPClusters(builder, dotOps[0]->getLoc()).failed()) |
| 531 | + if (transformTwoPPClusters(builder, dotOps[0]->getLoc()).failed()) { |
| 532 | + LDBG("Encountered failure when trying to execute the two ping pong " |
| 533 | + "cluster transformation"); |
517 | 534 | return; |
| 535 | + } |
518 | 536 | } else if (tileSize >= largeTile) { |
519 | 537 | // Avoid known register spilling. i.e., mfma16x16x16 & largetile & kpack>1 |
520 | | - if (intShape[0] == 16 && intShape[1] == 16 && kWidth == 8) |
| 538 | + if (intShape[0] == 16 && intShape[1] == 16 && kWidth == 8) { |
| 539 | + LDBG("Reached known register spilling case, skip pingpong scheduling"); |
521 | 540 | return; |
522 | | - if (transformFourPPClusters(builder, dotOps[0]->getLoc()).failed()) |
| 541 | + } |
| 542 | + if (transformFourPPClusters(builder, dotOps[0]->getLoc()).failed()) { |
| 543 | + LDBG("Encountered failure when trying to execute the four ping pong " |
| 544 | + "cluster transformation"); |
523 | 545 | return; |
| 546 | + } |
524 | 547 | } else |
525 | 548 | return; |
526 | 549 |
|
|
0 commit comments