Skip to content

Commit b578791

Browse files
njriasanNick Riasanovsky
authored and committed
[AMD] Add debug prints for pingpong scheduler failures (triton-lang#5975)
Adds LLVM debug messages when the Ping Pong Scheduler fails to execute for a reason that is not easy to determine statically. --------- Co-authored-by: Nick Riasanovsky <[email protected]>
1 parent 16c3c1a commit b578791

File tree

1 file changed

+29
-6
lines changed

1 file changed

+29
-6
lines changed

third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@
1313
#define GEN_PASS_CLASSES
1414
#include "TritonAMDGPUTransforms/Passes.h"
1515

16+
// Debug category string for this file. It is spliced into the DBGS() message
// prefix and, per LLVM convention, is the name LLVM_DEBUG filters on.
#define DEBUG_TYPE "tritonamdgpu-block-pingpong"
17+
// Yields llvm::dbgs() after writing the "[<DEBUG_TYPE>]: " prefix to it, so
// callers can keep streaming the rest of the message with `<<`.
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
18+
// Writes X as a single prefixed line via DBGS(), with the whole statement
// wrapped in LLVM_DEBUG. X is pasted into the stream expression unparenthesized,
// so it may itself be a `<<` chain (e.g. LDBG("n = " << n)).
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
19+
1620
using namespace mlir;
1721
namespace ttg = mlir::triton::gpu;
1822
namespace tt = mlir::triton;
@@ -447,9 +451,14 @@ void Pingponger::getDotPingponged() {
447451
// software pipelining and dot rank=2. Also only accept the for-loop with
448452
// supported combination of operations because this transformation is very
449453
// tightly scheduling the latencies.
450-
if (gLoadOps.size() < 2 || lLoadOps.size() < 2 || dotOps.size() != 1)
454+
if (gLoadOps.size() < 2 || lLoadOps.size() < 2 || dotOps.size() != 1) {
455+
std::stringstream message;
456+
message << "Unable to match ping pong scheduling pattern. Details: "
457+
<< gLoadOps.size() << " global loads, " << lLoadOps.size()
458+
<< " local loads, " << dotOps.size() << " dot products";
459+
LDBG(message.str());
451460
return;
452-
461+
}
453462
// Pingpong scheduling tries to form two different types of the instruction
454463
// clusters, i.e., Dot clusters and Memory clusters. While each SIMD has
455464
// two concurrent warps, both warps can execute a different type of
@@ -509,18 +518,32 @@ void Pingponger::getDotPingponged() {
509518
// numWarps=4 doesn't need asymmetric sync, return.
510519
return;
511520
} else if (numWarps == 8) { // Pingpong between warps from the same block
512-
if (gLoadOps.size() != 2 || lLoadOps.size() != 2)
521+
if (gLoadOps.size() != 2 || lLoadOps.size() != 2) {
522+
std::stringstream message;
523+
message << "Unable to match ping pong slicing pattern. Details: "
524+
<< gLoadOps.size() << " global loads, " << lLoadOps.size()
525+
<< " local loads";
526+
LDBG(message.str());
513527
return;
528+
}
514529
// Transform a loop where the tile size requires dots to be sliced
515530
if (tileSize == mediumTile) {
516-
if (transformTwoPPClusters(builder, dotOps[0]->getLoc()).failed())
531+
if (transformTwoPPClusters(builder, dotOps[0]->getLoc()).failed()) {
532+
LDBG("Encountered failure when trying to execute the two ping pong "
533+
"cluster transformation");
517534
return;
535+
}
518536
} else if (tileSize >= largeTile) {
519537
// Avoid known register spilling. i.e., mfma16x16x16 & largetile & kpack>1
520-
if (intShape[0] == 16 && intShape[1] == 16 && kWidth == 8)
538+
if (intShape[0] == 16 && intShape[1] == 16 && kWidth == 8) {
539+
LDBG("Reached known register spilling case, skip pingpong scheduling");
521540
return;
522-
if (transformFourPPClusters(builder, dotOps[0]->getLoc()).failed())
541+
}
542+
if (transformFourPPClusters(builder, dotOps[0]->getLoc()).failed()) {
543+
LDBG("Encountered failure when trying to execute the four ping pong "
544+
"cluster transformation");
523545
return;
546+
}
524547
} else
525548
return;
526549

0 commit comments

Comments
 (0)