
Commit 9e72047

This change is causing failures in some internal tests. There must still be some miscompile associated with this.
1 parent ca70f08

File tree

2 files changed (+32, -2 lines)

lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp

Lines changed: 31 additions & 0 deletions
@@ -450,6 +450,37 @@ assignMemoryLayouts(llvm::SmallVector<std::tuple<Operation *, int, Operation *>>
         // If we can't agree on a shared encoding skip pipelinig the load.
         if (incompatible)
           continue;
+
+        // HACK: Triton LLVM codegen has a bug where local_loads from #shared to
+        // #mma layout can lead to invalid code if the loaded shape is smaller
+        // than the mma tile (e.g. loading a 128x1 tensor for an MMAv2 dot with
+        // tile {16,8} is bad because 1 < 8). To work around this, don't
+        // pipeline such loads.
+        //
+        // The codegen bug is caught by an assertion, so if you think you've
+        // fixed it, feel free to delete this code and see if the assert still
+        // fails. :)
+        if (!loadInfo.sharedEncoding) {
+          if (auto dotEnc = dyn_cast<ttg::NvidiaMmaEncodingAttr>(
+                  dot.getResult().getType().getEncoding())) {
+            auto loadTy = cast<RankedTensorType>(op->getResultTypes()[0]);
+            auto mmaInstrShape = dotEnc.getInstrShape();
+            if (loadTy.getRank() < mmaInstrShape.size())
+              continue;
+            bool ok = true;
+            for (int i = 0; i < mmaInstrShape.size(); i++) {
+              if (loadTy.getShape()[loadTy.getRank() - mmaInstrShape.size() +
+                                    i] < mmaInstrShape[i]) {
+                ok = false;
+                break;
+              }
+            }
+            // If this load might trigger the bug, don't do the fallback logic
+            // below, which might allow the load to be pipelined.
+            if (!ok)
+              continue;
+          }
+        }
       }
     } else if (auto loadOp = dyn_cast<tt::LoadOp>(use)) {
       // The use of this loadOp is another loadOp. If the use is not in the
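
In isolation, the check this commit restores is: a load feeding an #mma-layout dot is only treated as safe to pipeline if every trailing dimension of the loaded shape is at least as large as the corresponding dimension of the MMA instruction tile. The standalone C++ sketch below illustrates that predicate on the 128x1 case from the comment; it is not the Triton API, and the helper name coversMmaTile is invented for illustration.

#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative sketch only (not Triton code): true when each trailing
// dimension of the load shape covers the matching MMA instruction-tile
// dimension, i.e. the case the pipeliner treats as safe.
static bool coversMmaTile(const std::vector<int64_t> &loadShape,
                          const std::vector<int64_t> &mmaInstrShape) {
  if (loadShape.size() < mmaInstrShape.size())
    return false; // mirrors the early `continue` on rank mismatch
  size_t offset = loadShape.size() - mmaInstrShape.size();
  for (size_t i = 0; i < mmaInstrShape.size(); ++i)
    if (loadShape[offset + i] < mmaInstrShape[i])
      return false; // e.g. 1 < 8 for a 128x1 load against a {16,8} tile
  return true;
}

int main() {
  // The problematic case from the comment: 128x1 load, MMAv2 tile {16,8}.
  std::printf("128x1  vs {16,8}: %s\n",
              coversMmaTile({128, 1}, {16, 8}) ? "may pipeline" : "skip");
  // A shape that covers the tile stays eligible for pipelining.
  std::printf("128x64 vs {16,8}: %s\n",
              coversMmaTile({128, 64}, {16, 8}) ? "may pipeline" : "skip");
}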

test/TritonGPU/loop-pipeline.mlir

Lines changed: 1 addition & 2 deletions
@@ -1453,8 +1453,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 :
 // -----

 // COMMON-LABEL: @dont_pipeline_128x1
-// AMD-NOT: local_load{{.*}}128x1
-// CHECK: local_load{{.*}}128x1
+// COMMON-NOT: local_load{{.*}}128x1
 #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
 #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}>
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
