
Commit d187943

Merge commit '86a2ac753befe5286a261ba3b64eb40bdcca5704'
2 parents: 75dcede + 86a2ac7

File tree: 11 files changed, +726 -314 lines

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 14 additions & 10 deletions
@@ -52,21 +52,25 @@ LogicalResult UpcastMXFPOp::verify() {
         "all dimensions except the last must match between operands");
   }
 
-  auto layoutX = xTy.getEncoding();
-  if (!layoutX || !isa<DotOperandEncodingAttr>(layoutX)) {
+  auto dotEncoding =
+      dyn_cast_or_null<DotOperandEncodingAttr>(xTy.getEncoding());
+  if (!dotEncoding) {
     return emitOpError("Expected a DotOperandEncodingAttr for values");
   }
-  auto layoutScale = scaleTy.getEncoding();
-  if (!layoutScale || !isa<BlockedEncodingAttr>(layoutScale)) {
+
+  auto blockedScale =
+      dyn_cast_or_null<BlockedEncodingAttr>(scaleTy.getEncoding());
+  if (!blockedScale) {
     return emitOpError("Expected a BlockOperandEncoding for scales");
   }
-  auto blockedScale = cast<BlockedEncodingAttr>(layoutScale);
 
-  // Necessary to keep all of the scales of a given block of values in the same
-  // warp
-  auto threadsPerWarp = blockedScale.getThreadsPerWarp();
-  if (threadsPerWarp != ArrayRef<unsigned>({16, 2})) {
-    return emitOpError("Expected threads per warp to be {16, 2}");
+  if (isa<NvidiaMmaEncodingAttr>(dotEncoding.getParent())) {
+    // Necessary to keep all of the scales of a given block of values in the
+    // same warp
+    auto threadsPerWarp = blockedScale.getThreadsPerWarp();
+    if (threadsPerWarp != ArrayRef<unsigned>({16, 2})) {
+      return emitOpError("Expected threads per warp to be {16, 2}");
+    }
   }
 
   return success();
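
The net effect of this hunk is that the {16, 2} threads-per-warp requirement on the scale layout is now enforced only when the dot operand's parent encoding is an NVIDIA MMA encoding; any other parent encoding skips that check. Below is a minimal sketch restating the post-patch logic as a standalone helper, using only the MLIR casting and diagnostic APIs that already appear in the diff; the helper name, free-function form, and header path are illustrative assumptions, not part of the commit.

// Sketch only: condensed restatement of the scale-layout check after this
// patch. The real check lives inside UpcastMXFPOp::verify(); this helper
// and its signature are hypothetical.
#include "mlir/IR/BuiltinTypes.h"                 // RankedTensorType
#include "triton/Dialect/TritonGPU/IR/Dialect.h"  // TritonGPU encodings (path assumed)

using namespace mlir;
using namespace mlir::triton::gpu;

static LogicalResult
checkScaleLayout(RankedTensorType xTy, RankedTensorType scaleTy,
                 llvm::function_ref<InFlightDiagnostic()> emitError) {
  // Values must carry a dot-operand encoding.
  auto dotEncoding =
      dyn_cast_or_null<DotOperandEncodingAttr>(xTy.getEncoding());
  if (!dotEncoding)
    return emitError() << "Expected a DotOperandEncodingAttr for values";

  // Scales must carry a blocked encoding.
  auto blockedScale =
      dyn_cast_or_null<BlockedEncodingAttr>(scaleTy.getEncoding());
  if (!blockedScale)
    return emitError() << "Expected a BlockOperandEncoding for scales";

  // The {16, 2} threads-per-warp constraint is now gated on the parent
  // encoding: it only applies when the dot operand feeds an NVIDIA MMA.
  if (isa<NvidiaMmaEncodingAttr>(dotEncoding.getParent())) {
    if (blockedScale.getThreadsPerWarp() != ArrayRef<unsigned>({16, 2}))
      return emitError() << "Expected threads per warp to be {16, 2}";
  }
  return success();
}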

lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp

Lines changed: 0 additions & 31 deletions
@@ -450,37 +450,6 @@ assignMemoryLayouts(llvm::SmallVector<std::tuple<Operation *, int, Operation *>>
         // If we can't agree on a shared encoding skip pipelinig the load.
         if (incompatible)
           continue;
-
-        // HACK: Triton LLVM codegen has a bug where local_loads from #shared to
-        // #mma layout can lead to invalid code if the loaded shape is smaller
-        // than the mma tile (e.g. loading a 128x1 tensor for an MMAv2 dot with
-        // tile {16,8} is bad because 1 < 8). To work around this, don't
-        // pipeline such loads.
-        //
-        // The codegen bug is caught by an assertion, so if you think you've
-        // fixed it, feel free to delete this code and see if the assert still
-        // fails. :)
-        if (!loadInfo.sharedEncoding) {
-          if (auto dotEnc = dyn_cast<ttg::NvidiaMmaEncodingAttr>(
-                  dot.getResult().getType().getEncoding())) {
-            auto loadTy = cast<RankedTensorType>(op->getResultTypes()[0]);
-            auto mmaInstrShape = dotEnc.getInstrShape();
-            if (loadTy.getRank() < mmaInstrShape.size())
-              continue;
-            bool ok = true;
-            for (int i = 0; i < mmaInstrShape.size(); i++) {
-              if (loadTy.getShape()[loadTy.getRank() - mmaInstrShape.size() +
-                                    i] < mmaInstrShape[i]) {
-                ok = false;
-                break;
-              }
-            }
-            // If this load might trigger the bug, don't do the fallback logic
-            // below, which might allow the load to be pipelined.
-            if (!ok)
-              continue;
-          }
-        }
       }
     } else if (auto loadOp = dyn_cast<tt::LoadOp>(use)) {
       // The use of this loadOp is another loadOp. If the use is not in the
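
For context, the deleted workaround refused to pipeline a load whose trailing dimensions were smaller than the MMA instruction tile of the consuming dot (the 128x1 vs. {16, 8} example in the removed comment). The sketch below pulls that shape test out as a hypothetical helper, using only the quantities visible in the removed code; the function name and signature are invented for illustration.

// Sketch only: the shape test the removed workaround performed, expressed as
// a standalone predicate. Returns true when every trailing dimension of the
// loaded tensor is at least as large as the corresponding MMA instruction
// tile dimension, i.e. when the load would not have hit the old codegen
// assertion.
#include "llvm/ADT/ArrayRef.h"
#include <cstdint>

static bool loadCoversMmaTile(llvm::ArrayRef<int64_t> loadShape,
                              llvm::ArrayRef<unsigned> mmaInstrShape) {
  // A load with lower rank than the instruction shape was also skipped by
  // the old code; treat it as not covering the tile.
  if (loadShape.size() < mmaInstrShape.size())
    return false;
  size_t offset = loadShape.size() - mmaInstrShape.size();
  for (size_t i = 0; i < mmaInstrShape.size(); ++i)
    if (loadShape[offset + i] < static_cast<int64_t>(mmaInstrShape[i]))
      return false;  // e.g. a 128x1 load vs. a {16, 8} MMAv2 tile (1 < 8)
  return true;
}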
