Calculate block load tile layout dim size from bases (#3971)

alexbaden · web-flow · commit bd88137b76b2 · 2025-04-23T16:44:38.000-04:00
Previously, we attempted to compute the total size for the 2D block loads
according to the tensor layout using input parameters like tensor shape
and warp shape. This can be error-prone, since the tensor shape is
manipulated according to the warp distribution. A cleaner solution is to
modify the dimension sizes according to the bases. By loading the
dimensions from the tile layout in addition to the bases, we can modify
the dimension sizes using the same metrics used to construct the bases.
This appears to be giving correct results using the `test_block_load`
tests. I did not use this approach initially because I was concerned
about the load tile being too "big", but because we now incorporate
strides in the loads, this approach should faithfully represent the total
dimensionality of the loaded data.
diff --git a/python/test/unit/intel/test_block_load.py b/python/test/unit/intel/test_block_load.py
@@ -6,7 +6,8 @@
 from triton._internal_testing import is_xpu
 
 
-@pytest.mark.parametrize("M, N", [[256, 64], [256, 32], [128, 32], [128, 16], [128, 8], [64, 64], [64, 32], [32, 32]])
+@pytest.mark.parametrize("M, N",
+                         [[256, 64], [256, 32], [128, 32], [128, 16], [128, 8], [64, 64], [64, 32], [32, 32], [16, 64]])
 @pytest.mark.parametrize("dtype_str", ["float32", "float16", "int8"])
 @pytest.mark.parametrize("transpose", [True, False])
 @pytest.mark.skipif(not is_xpu(), reason="Block load tests are specific to the XPU backend")
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -1493,16 +1493,28 @@ struct LoadOpConversion
     // layout.
     auto bases = tileLayout.getBases();
     std::vector<std::vector<int32_t>> newLoadBases;
+
+    SmallVector<std::pair<StringAttr, int32_t>> outDims;
+    for (auto [name, size] :
+         llvm::zip(tileLayout.getOutDimNames(), tileLayout.getOutDimSizes())) {
+      outDims.push_back(std::make_pair(name, size));
+    }
+    assert(outDims[0].first == S("dim0"));
+    assert(outDims[1].first == S("dim1"));
+
     for (size_t i = 0;
          i < llvm::Log2_32(numRepInner / numOperandsInnerDimPerLoad); i++) {
       newLoadBases.push_back({0, static_cast<int>((1 << i) * repKStride *
                                                   numOperandsInnerDimPerLoad)});
+      outDims[1].second *= repKStride * numOperandsInnerDimPerLoad;
     }
     for (size_t i = 0; i < llvm::Log2_32(numLoadPerOutRepCluster); i++) {
       newLoadBases.push_back({static_cast<int>((1 << i) * repStride), 0});
+      outDims[0].second *= repStride;
     }
     for (size_t i = 0; i < llvm::Log2_32(numRepOuter); i++) {
       newLoadBases.push_back({static_cast<int>((1 << i) * repOuterStride), 0});
+      outDims[0].second *= repOuterStride;
     }
 
     LLVM_DEBUG({
@@ -1513,23 +1525,6 @@ struct LoadOpConversion
       }
     });
 
-    SmallVector<std::pair<StringAttr, int32_t>> outDims;
-    // Copy the existing dimensions first. This allows us to re-use the existing
-    // dim names as well as the sizes should the bases vector be empty (one
-    // load).
-    for (auto [name, size] :
-         llvm::zip(tileLayout.getOutDimNames(), tileLayout.getOutDimSizes())) {
-      outDims.push_back(std::make_pair(name, size));
-    }
-    if (newLoadBases.size() > 0) {
-      outDims[0] = std::make_pair(outDims[0].first, tensorShape[dimOuter]);
-      outDims[1] = std::make_pair(
-          outDims[1].first,
-          std::max(warpShape[dimInner],
-                   static_cast<unsigned int>(tensorShape[dimInner] *
-                                             repCluster[dimInner])));
-    }
-
     LLVM_DEBUG({
       llvm::dbgs() << "New tile layout dimensions after adding load bases:\n";
       for (size_t i = 0; i < outDims.size(); i++) {