
Commit 3752705

Merge OpenAI commit ddada27 (#5069)
This PR changes the Triton base from 72ec661 to ddada27 (Sep 4). Pass rate: 98.6%
2 parents 79841bd + c61601d commit 3752705


21 files changed: +284 -382 lines changed


lib/Dialect/Triton/Transforms/Combine.cpp

Lines changed: 5 additions & 0 deletions
@@ -252,6 +252,11 @@ class CombineDotAddPattern : public mlir::OpRewritePattern<OpTy> {
     }
     if (!isZero(dotOp.getC()))
       return failure();
+    if constexpr (std::is_same_v<OpTy, arith::AddFOp>) {
+      if (dotOp.getMaxNumImpreciseAcc() != 0) {
+        return failure();
+      }
+    }
     rewriter.modifyOpInPlace(dotOp, [&] {
       dotOp.getCMutable().assign(isDotLHS ? addOp.getRhs() : addOp.getLhs());
       dotOp->moveBefore(addOp);
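For context, CombineDotAddPattern folds an elementwise add into the dot accumulator (dot(a, b) + c -> dot(a, b, c)); the new guard skips that fold when imprecise accumulation is requested. Below is a minimal user-level sketch of the shape the pattern targets, assuming tl.dot's max_num_imprecise_acc keyword as in current Triton; the kernel name, tensors, and sizes are illustrative only, and the snippet needs a GPU to run.

import torch
import triton
import triton.language as tl

@triton.jit
def dot_then_add(a_ptr, b_ptr, c_ptr, out_ptr, BLOCK: tl.constexpr):
    rows = tl.arange(0, BLOCK)[:, None]
    cols = tl.arange(0, BLOCK)[None, :]
    a = tl.load(a_ptr + rows * BLOCK + cols)
    b = tl.load(b_ptr + rows * BLOCK + cols)
    c = tl.load(c_ptr + rows * BLOCK + cols)
    # With exact accumulation, this add can be folded into the dot accumulator.
    out = tl.dot(a, b) + c
    # With a nonzero max_num_imprecise_acc (typically used with fp8 inputs),
    # the pattern now bails out, since folding would change which additions
    # run at reduced precision.
    tl.store(out_ptr + rows * BLOCK + cols, out)

a = torch.randn(64, 64, device="cuda", dtype=torch.float16)
b = torch.randn(64, 64, device="cuda", dtype=torch.float16)
c = torch.randn(64, 64, device="cuda", dtype=torch.float32)
out = torch.empty_like(c)
dot_then_add[(1,)](a, b, c, out, BLOCK=64)
torch.testing.assert_close(out, a.float() @ b.float() + c, rtol=1e-2, atol=1e-2)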

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 3 additions & 2 deletions
@@ -692,18 +692,19 @@ LinearLayout mfmaDotToLinearLayout(DotOperandEncodingAttr dotMfmaLayout,
   int mIndex = 0 + hasBatchDim;
 
   int32_t kWidth = dotMfmaLayout.getKWidth();
-  auto kDimIndex = dotMfmaLayout.getOpIdx() == 0 ? rank - 1 : rank - 2;
+  auto nonKDimIndex = dotMfmaLayout.getOpIdx() == 0 ? rank - 2 : rank - 1;
 
   auto warpsPerCTA = mfmaLayout.getWarpsPerCTA();
   auto tilesPerWarp = mfmaLayout.getTilesPerWarp();
-  auto tilePerWarpNonK = tilesPerWarp[kDimIndex];
+  auto tilePerWarpNonK = tilesPerWarp[nonKDimIndex];
 
   auto mDim = mfmaLayout.getMDim();
   auto nDim = mfmaLayout.getNDim();
   auto opIdx = dotMfmaLayout.getOpIdx();
   auto nonKDim = opIdx == 0 ? mDim : nDim;
   constexpr int warpSize = 64;
 
+  auto kDimIndex = dotMfmaLayout.getOpIdx() == 0 ? rank - 1 : rank - 2;
   int32_t kSize = shape[kDimIndex];
 
   MLIRContext *ctx = dotMfmaLayout.getContext();
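The fix above indexes tilesPerWarp with the non-K dimension rather than the K dimension. A plain-Python sanity sketch of that index selection (dim_indices is an illustrative helper, not a function in the Triton code base):

# Operand A (opIdx == 0) is shaped (..., M, K); operand B (opIdx == 1) is (..., K, N).
def dim_indices(op_idx: int, rank: int):
    k_dim_index = rank - 1 if op_idx == 0 else rank - 2
    non_k_dim_index = rank - 2 if op_idx == 0 else rank - 1
    return k_dim_index, non_k_dim_index

# Rank-2 case: A's non-K dim is M (index 0); B's non-K dim is N (index 1).
assert dim_indices(0, 2) == (1, 0)
assert dim_indices(1, 2) == (0, 1)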

lib/Dialect/TritonGPU/Transforms/FuseNestedLoops.cpp

Lines changed: 3 additions & 0 deletions
@@ -1006,6 +1006,9 @@ static LogicalResult speculateInnerLoopLength(scf::ForOp outerLoop,
   newInnerLoop.replaceAllUsesWith(newInnerLoop.getInits());
   newInnerLoop.erase();
 
+  // Clear up the warp specialization attributes for the specialized loop.
+  newLoop->removeAttr(kWarpSpecializeAttrName);
+
   // Move the loop nest into the `else` branch.
   outerLoop.replaceAllUsesWith(ifOp.getResults());
   Block *block = b.createBlock(&ifOp.getElseRegion());

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/RewritePartitionDependencies.cpp

Lines changed: 32 additions & 13 deletions
@@ -1,7 +1,10 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/Pass/Pass.h"
 #include "nvidia/include/Dialect/NVWS/IR/Dialect.h"
+#include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/Partition.h"
 #include "triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h"
@@ -202,17 +205,12 @@ LogicalResult DependencyRewriter::run() {
        llvm::zip(schedule.getPartitions(), partitionUseInfo)) {
     // The amount of buffering is based on the longest distance to a user.
     for (auto &[output, info] : useInfo) {
-      // FIXME: No IR support for passing simple scalars through shared
-      // memory.
-      auto tensorType = dyn_cast<RankedTensorType>(output.getType());
-      if (!tensorType) {
-        return mlir::emitWarning(output.getLoc(),
-                                 "FIXME: only tensor SSA dependencies between "
-                                 "partitions are supported");
-      }
+      b.setLoc(output.getLoc());
+      ImplicitLocOpBuilder endBuilder(b.getLoc(), loop->getNextNode());
 
-      Operation *defOp;
+      bool isScalar = false;
       Value tmp = output;
+      Operation *defOp;
       while (true) {
         if (auto arg = dyn_cast<BlockArgument>(tmp)) {
           tmp = loop.getBody()->getTerminator()->getOperand(arg.getArgNumber() -
@@ -222,14 +220,31 @@ LogicalResult DependencyRewriter::run() {
         defOp = tmp.getDefiningOp();
         break;
       }
+      Value val = output;
+      auto tensorType = dyn_cast<RankedTensorType>(output.getType());
+      if (!tensorType) {
+        isScalar = true;
+        b.setInsertionPointAfterValue(output);
+        auto mod = output.getParentRegion()->getParentOfType<ModuleOp>();
+        auto nWarps = lookupNumWarps(mod);
+        auto threadsPerWarp =
+            triton::gpu::TritonGPUDialect::getThreadsPerWarp(mod);
+        int CTAs = triton::gpu::TritonGPUDialect::getNumCTAs(mod);
+        Attribute encoding = getDefaultBlockedEncoding(
+            b.getContext(), {1}, nWarps, threadsPerWarp, CTAs);
+        tensorType = RankedTensorType::get({1}, output.getType(), encoding);
+        StageCluster srcStageCluster = getStageCluster(defOp);
+
+        defOp = b.createInto<triton::SplatOp>(partition, srcStageCluster,
+                                              tensorType, output);
+        val = defOp->getResult(0);
+      }
 
       // Buffer the value based on the greatest distance to a consumer
       // partition.
       int maxDistance = info.getMaxUseDistance(partition);
 
       // Allocate buffers for the value and its associated barriers.
-      b.setLoc(output.getLoc());
-      ImplicitLocOpBuilder endBuilder(b.getLoc(), loop->getNextNode());
       AsyncRef aref = allocateAsyncValue(tensorType, maxDistance);
 
       unsigned numConsumers = info.consumers.size();
@@ -249,20 +264,24 @@ LogicalResult DependencyRewriter::run() {
         // partition with it.
         Value value = b.createInto<LocalLoadOp>(*usePartition, sinkSrcCluster,
                                                 tensorType, view);
+        if (isScalar) {
+          value = b.createInto<triton::UnsplatOp>(*usePartition, sinkSrcCluster,
+                                                  value);
+        }
         for (OpOperand *use : uses)
           use->set(value);
         exitOp(b);
       }
 
       // Set up production of the value
-      if (isa<BlockArgument>(output))
+      if (isa<BlockArgument>(val))
         b.setInsertionPointToStart(loop.getBody());
       else
         b.setInsertionPointAfter(defOp);
 
       StageCluster srcStageCluster = getStageCluster(defOp);
       auto [view, exitOp] = aref.putView(b, partition, srcStageCluster);
-      b.createInto<LocalStoreOp>(partition, srcStageCluster, output, view);
+      b.createInto<LocalStoreOp>(partition, srcStageCluster, val, view);
       exitOp(b);
     }
   }

python/src/ir.cc

Lines changed: 3 additions & 6 deletions
@@ -554,6 +554,8 @@ void init_triton_ir(py::module &&m) {
           })
       .def("verify",
            [](OpState &self) -> bool {
+             TritonSourceMgrDiagnosticHandler handler =
+                 setupTritonDiagnosticHandler(self.getContext());
              return succeeded(verify(self.getOperation()));
            })
       .def("get_operation", [](OpState &self) { return self.getOperation(); });
@@ -700,12 +702,7 @@ void init_triton_ir(py::module &&m) {
       .def("walk",
            [](ModuleOp &self, const std::function<void(Operation *)> &fn) {
              self.walk(fn);
-           })
-      .def("verify_with_diagnostics", [](ModuleOp &self) {
-        TritonSourceMgrDiagnosticHandler handler =
-            setupTritonDiagnosticHandler(self.getContext());
-        return succeeded(verify(self.getOperation()));
-      });
+           });
 
   m.def("make_attr", [](const std::vector<int> &values, MLIRContext &context) {
     return mlir::cast<Attribute>(DenseIntElementsAttr::get(
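With this change, verify() installs the Triton source-manager diagnostic handler itself, which is why the separate verify_with_diagnostics() binding on ModuleOp is removed. A minimal usage sketch, assuming the in-tree bindings expose context(), load_dialects(), parse_mlir_module(path, ctx), and module.verify() as they do today; the temporary file and its contents are purely illustrative.

import os
import tempfile
from triton._C.libtriton import ir  # assumption: Triton's private MLIR bindings

# Write a trivially valid MLIR module to a throwaway file.
with tempfile.NamedTemporaryFile("w", suffix=".mlir", delete=False) as f:
    f.write("module {}\n")
    path = f.name

ctx = ir.context()
ir.load_dialects(ctx)
mod = ir.parse_mlir_module(path, ctx)
# verify() now reports failures through the Triton diagnostic handler with
# source locations; there is no separate verify_with_diagnostics() entry point.
assert mod.verify()
os.remove(path)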

python/test/gluon/test_lowerings.py

Lines changed: 45 additions & 0 deletions
@@ -1209,3 +1209,48 @@ def test_gather_layouts(axis, src_layout, index_layout, src_shape, idx_shape, de
 
     torch.testing.assert_close(out, ref, rtol=0, atol=0)
     assert ("nvvm.shfl.sync.idx" in obj.asm["llir"]) or ("llvm.amdgcn.ds.bpermute" in obj.asm["llir"])
+
+
+@pytest.mark.parametrize("M, N, M_tile_size, N_tile_size",
+                         [[128, 128, 64, 64], [128, 128, 64, 32], [128, 64, 64, 32], [256, 128, 64, 64]])
+def test_memdesc_subslice(M, N, M_tile_size, N_tile_size, device):
+    if M % M_tile_size != 0 or N % N_tile_size != 0:
+        pytest.skip(f"Shape size ({M}, {N}) must be divisible by tile size ({M_tile_size}, {N_tile_size})")
+
+    num_rows_per_warp = THREADS_PER_WARP // 4
+    blocked_layout = ttgl.BlockedLayout(size_per_thread=[1, 8], threads_per_warp=[num_rows_per_warp, 4],
+                                        warps_per_cta=[4, 1], order=[1, 0])
+    shared_layout = ttgl.SwizzledSharedLayout(vec=8, per_phase=1, max_phase=8, order=[1, 0])
+
+    @gluon.jit
+    def kernel(
+        out,
+        M: ttgl.constexpr,
+        N: ttgl.constexpr,
+        BLOCK_SIZE_M: ttgl.constexpr,
+        BLOCK_SIZE_N: ttgl.constexpr,
+        blocked_layout: ttgl.constexpr,
+        shared_layout: ttgl.constexpr,
+    ):
+        offs_m = ttgl.arange(0, M, layout=ttgl.SliceLayout(1, blocked_layout))[:, None]
+        offs_n = ttgl.arange(0, N, layout=ttgl.SliceLayout(0, blocked_layout))[None, :]
+        vals = ttgl.load(out + offs_m * N + offs_n)
+
+        smem: ttgl.shared_memory_descriptor = ttgl.allocate_shared_memory(vals.dtype, (M, N), shared_layout, value=vals)
+        for i in ttgl.static_range(M // BLOCK_SIZE_M):
+            for j in ttgl.static_range(N // BLOCK_SIZE_N):
+                tile = smem.slice(i * BLOCK_SIZE_M, BLOCK_SIZE_M, dim=0).slice(j * BLOCK_SIZE_N, BLOCK_SIZE_N, dim=1)
+                tile_vals = tile.load(blocked_layout)
+                tile_offs_m = ttgl.arange(0, BLOCK_SIZE_M, layout=ttgl.SliceLayout(1, blocked_layout))[:, None]
+                tile_offs_n = ttgl.arange(0, BLOCK_SIZE_N, layout=ttgl.SliceLayout(0, blocked_layout))[None, :]
+                linear_idx = tile_offs_m * N + tile_offs_n + i * BLOCK_SIZE_M * N + j * BLOCK_SIZE_N
+                tile.store(linear_idx + tile_vals)
+
+        vals = smem.load(blocked_layout)
+        ttgl.store(out + offs_m * N + offs_n, vals)
+
+    out = torch.zeros((M, N), device=device, dtype=torch.float16)
+    kernel[(1, )](out, M, N, M_tile_size, N_tile_size, blocked_layout, shared_layout)
+
+    out_ref = torch.arange(0, M * N, device=device).reshape((M, N)).to(torch.float16)
+    torch.testing.assert_close(out, out_ref, rtol=0, atol=0)
