Commit 268b414

Merge commit 'c24aa15e30ebacd16799dadf1fe86d954ea5db97'
2 parents bdec54e + c24aa15

File tree

6 files changed, +112 -69 lines changed

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 63 additions & 57 deletions
@@ -228,33 +228,11 @@ sharedToLinearLayoutAMDRotating(ArrayRef<int64_t> shape,
   return combineCtaCgaWithShape(ctaLayout, shared.getCTALayout(), shape);
 }
 
-} // namespace
-
-LinearLayout nvmmaSharedToLinearLayout(ArrayRef<int64_t> shape,
-                                       NVMMASharedEncodingAttr shared,
+// Returns the layout of a single core matrix which tiles the nvmma layout
+LinearLayout getCoreMatrixLinearLayout(NVMMASharedEncodingAttr shared,
                                        bool disableSwizzle) {
-  MLIRContext *ctx = shared.getContext();
-  int rank = shape.size();
-  auto shapePerCTA = getShapePerCTA(shared, shape);
-  if (rank == 1) {
-    // TODO: Not sure if this is correct.
-    return combineCtaCgaWithShape(
-        LinearLayout::identity1D(shapePerCTA[0], S("offset"), S("dim0")),
-        shared.getCTALayout(), shape);
-  }
-  // Construct bases for a the layout's 2-dimensional tile.
-  assert(rank >= 2);
-  int batchDims = rank - 2;
+  auto *ctx = shared.getContext();
 
-  // Collapse all the outer dim into one. We will then create a layout for this
-  // shape and reshape it to the original shape.
-  std::array<int64_t, 2> collapsedShapePerCTA = {shapePerCTA[batchDims],
-                                                 shapePerCTA[batchDims + 1]};
-  for (int i = 0; i < batchDims; i++)
-    collapsedShapePerCTA[0] *= shapePerCTA[i];
-  if (shared.getTransposed()) {
-    std::swap(collapsedShapePerCTA[0], collapsedShapePerCTA[1]);
-  }
   int elemBitWidth = shared.getElementBitWidth();
   int tileWidthBytes = shared.getSwizzlingByteWidth();
   int vec = 128 / elemBitWidth;
@@ -273,25 +251,9 @@ LinearLayout nvmmaSharedToLinearLayout(ArrayRef<int64_t> shape,
 
   int tileRows = 8;
   int tileCols = 8 * tileWidthBytes / elemBitWidth;
-  bool isFp4Padded = false;
-  if (auto sharedMMALayout =
-          dyn_cast<triton::gpu::NVMMASharedEncodingAttr>(shared)) {
-    if (sharedMMALayout.getFp4Padded()) {
-      isFp4Padded = true;
-    }
-  }
+  bool isFp4Padded = shared.getFp4Padded();
   int packingFactor = isFp4Padded ? 2 : 1;
 
-  if (collapsedShapePerCTA[1] * packingFactor < tileCols ||
-      collapsedShapePerCTA[0] < tileRows) {
-    llvm::errs() << "Illegal shared layout; expected collapsed shapePerCTA to "
-                    "be at least ["
-                 << tileRows << ", " << tileCols << "], collapsedShapePerCTA: ["
-                 << collapsedShapePerCTA[0] << ", " << collapsedShapePerCTA[1]
-                 << "]\n";
-    llvm::report_fatal_error("Illegal shared layout");
-  }
-
   std::vector<std::vector<int>> bases2D;
   for (int col = 1; col < tileCols; col *= 2) {
     if (isFp4Padded) {
@@ -309,30 +271,75 @@ LinearLayout nvmmaSharedToLinearLayout(ArrayRef<int64_t> shape,
   for (int row = 1; row < tileRows; row *= 2) {
     if (disableSwizzle) {
       bases2D.push_back({row, 0});
-      continue;
-    }
-    if (isFp4Padded) {
+    } else if (isFp4Padded) {
       int colPadded = vec * ((row / perPhase) % maxPhase);
       int colPacked = colPadded / 16 * 8 + colPadded % 8;
       bases2D.push_back({row, colPacked});
     } else {
       bases2D.push_back({row, vec * ((row / perPhase) % maxPhase)});
     }
   }
+  auto outDimNames = standardOutDimNames(ctx, 2);
+  auto kRow = outDimNames[1];
+  auto kCol = outDimNames[0];
+  LinearLayout tileLayout =
+      LinearLayout({{S("offset"), bases2D}}, {kRow, kCol});
+  return tileLayout;
+}
+
+} // namespace
+
+LinearLayout nvmmaSharedToLinearLayout(ArrayRef<int64_t> shape,
+                                       NVMMASharedEncodingAttr shared,
+                                       bool disableSwizzle) {
+  MLIRContext *ctx = shared.getContext();
+  int rank = shape.size();
+  auto shapePerCTA = getShapePerCTA(shared, shape);
+  if (rank == 1) {
+    // TODO: Not sure if this is correct.
+    return combineCtaCgaWithShape(
+        LinearLayout::identity1D(shapePerCTA[0], S("offset"), S("dim0")),
+        shared.getCTALayout(), shape);
+  }
+  // Construct bases for a the layout's 2-dimensional tile.
+  assert(rank >= 2);
+  int batchDims = rank - 2;
 
-  // Then distribute the remaining rows.
-  for (int row = tileRows; row < collapsedShapePerCTA[0]; row *= 2) {
-    bases2D.push_back({row, 0});
+  // Collapse all the outer dim into one. We will then create a layout for this
+  // shape and reshape it to the original shape.
+  std::array<int64_t, 2> collapsedShapePerCTA{shapePerCTA[batchDims],
+                                              shapePerCTA[batchDims + 1]};
+  for (int i = 0; i < batchDims; i++)
+    collapsedShapePerCTA[0] *= shapePerCTA[i];
+  if (shared.getTransposed()) {
+    std::swap(collapsedShapePerCTA[0], collapsedShapePerCTA[1]);
   }
 
+  auto tileLayout = getCoreMatrixLinearLayout(shared, disableSwizzle);
   auto outDimNames = standardOutDimNames(ctx, 2);
-  std::reverse(outDimNames.begin(), outDimNames.end());
-  LinearLayout tileLayout = LinearLayout({{S("offset"), bases2D}}, outDimNames);
-  // Expand the layout to convert the whole shape per CTA.
-  llvm::SmallDenseMap<StringAttr, int64_t> namedShape;
-  namedShape[outDimNames[0]] = collapsedShapePerCTA[0];
-  namedShape[outDimNames[1]] = collapsedShapePerCTA[1];
-  tileLayout = ensureLayoutNotSmallerThan(tileLayout, namedShape);
+  auto kRow = outDimNames[1];
+  auto kCol = outDimNames[0];
+  auto tileRows = tileLayout.getOutDimSize(kRow);
+  auto tileCols = tileLayout.getOutDimSize(kCol);
+
+  int packingFactor = shared.getFp4Padded() ? 2 : 1;
+  if (collapsedShapePerCTA[1] * packingFactor < tileCols ||
+      collapsedShapePerCTA[0] < tileRows) {
+    llvm::errs() << "Illegal shared layout; expected collapsed shapePerCTA to "
+                    "be at least ["
+                 << tileRows << ", " << (tileCols / packingFactor)
+                 << "], collapsedShapePerCTA: [" << collapsedShapePerCTA[0]
+                 << ", " << collapsedShapePerCTA[1] << "]\n";
+    llvm::report_fatal_error("Illegal shared layout");
+  }
+
+  // Distribute the remaining rows and cols.
+  auto kOffset = S("offset");
+  auto layout = tileLayout;
+  layout *= LinearLayout::identity1D(collapsedShapePerCTA[0] / tileRows,
+                                     kOffset, kRow);
+  layout *= LinearLayout::identity1D(collapsedShapePerCTA[1] / tileCols,
+                                     kOffset, kCol);
 
   // Reshape the layout to the N-D pre-transposed shape per CTA.
   SmallVector<int64_t> maybeTransposedShapePerCTA = shapePerCTA;
@@ -344,8 +351,7 @@ LinearLayout nvmmaSharedToLinearLayout(ArrayRef<int64_t> shape,
                maybeTransposedShapePerCTA.begin() + 1,
                maybeTransposedShapePerCTA.end());
   }
-  auto reshapedLayout =
-      reshapeLayout(ctx, tileLayout, maybeTransposedShapePerCTA);
+  auto reshapedLayout = reshapeLayout(ctx, layout, maybeTransposedShapePerCTA);
 
   if (shared.getTransposed()) {
     SmallVector<int> order = {rank - 1};
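
As a quick illustration of the core-matrix bases built above, the small Python sketch below (not part of the commit) evaluates the same column-offset formula, vec * ((row / perPhase) % maxPhase), for the power-of-two row bases. The perPhase and maxPhase values are assumptions for a 128-byte swizzle of 16-bit elements; they are not taken from this diff.

# Sketch only: mirrors bases2D.push_back({row, vec * ((row / perPhase) % maxPhase)})
# from getCoreMatrixLinearLayout, under assumed swizzle parameters.
elem_bitwidth = 16
vec = 128 // elem_bitwidth        # elements per 128-bit vector -> 8
tile_rows = 8
per_phase, max_phase = 1, 8       # assumed values for a 128B swizzle

row = 1
while row < tile_rows:
    col = vec * ((row // per_phase) % max_phase)
    print(f"offset base for row {row}: column {col}")
    row *= 2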

python/test/unit/conftest.py

Lines changed: 37 additions & 6 deletions
@@ -3,6 +3,7 @@
 import pathlib
 import pytest
 import tempfile
+from typing import Optional, Set
 
 
 def pytest_configure(config):
@@ -76,20 +77,50 @@ def fresh_triton_cache():
         os.environ.pop("TRITON_CACHE_DIR", None)
 
 
-@pytest.fixture
-def fresh_knobs(request, monkeypatch):
+def _fresh_knobs_impl(monkeypatch, skipped_attr: Optional[Set[str]] = None):
     from triton import knobs
+
+    if skipped_attr is None:
+        skipped_attr = set()
+
     knobs_map = {
         name: knobset
         for name, knobset in knobs.__dict__.items()
-        if isinstance(knobset, knobs.base_knobs) and knobset != knobs.base_knobs
+        if isinstance(knobset, knobs.base_knobs) and knobset != knobs.base_knobs and name not in skipped_attr
     }
-    try:
+
+    def fresh_function():
         for name, knobset in knobs_map.items():
             setattr(knobs, name, knobset.copy().reset())
             for knob in knobset.knob_descriptors.values():
                 monkeypatch.delenv(knob.key, raising=False)
-        yield knobs
-    finally:
+        return knobs
+
+    def reset_function():
         for name, knobset in knobs_map.items():
             setattr(knobs, name, knobset)
+
+    return fresh_function, reset_function
+
+
+@pytest.fixture
+def fresh_knobs(monkeypatch):
+    fresh_function, reset_function = _fresh_knobs_impl(monkeypatch)
+    try:
+        yield fresh_function()
+    finally:
+        reset_function()
+
+
+@pytest.fixture
+def fresh_knobs_except_libraries(monkeypatch):
+    """
+    A variant of `fresh_knobs` that keeps library path
+    information from the environment as these may be
+    needed to successfully compile kernels.
+    """
+    fresh_function, reset_function = _fresh_knobs_impl(monkeypatch, skipped_attr={"build", "nvidia", "amd"})
+    try:
+        yield fresh_function()
+    finally:
+        reset_function()
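
For context, here is a minimal usage sketch (not part of this commit) of the new fresh_knobs_except_libraries fixture; the test name is hypothetical, and the knob it touches (compilation.listener) is the same one used by test_compilation_listener.py below.

# Hypothetical test: the fixture resets every knob set except `build`,
# `nvidia`, and `amd`, so library paths needed for compilation survive.
def test_with_clean_knobs(fresh_knobs_except_libraries):
    knobs = fresh_knobs_except_libraries
    # compilation knobs start from their defaults (listener assumed to be None)
    assert knobs.compilation.listener is None
    knobs.compilation.listener = lambda *args, **kwargs: None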

python/test/unit/runtime/test_compilation_listener.py

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@ def cumsum_kernel(ptr):
     tl.store(block, tl.cumsum(x, 0))
 
 
-def test_compile_stats(device: str, fresh_knobs: Any, fresh_triton_cache: str) -> None:
+def test_compile_stats(device: str, fresh_knobs_except_libraries: Any, fresh_triton_cache: str) -> None:
     captured: Union[tuple[Union[ASTSource, IRSource], dict[str, Any], CompileTimes, bool], None] = None
 
     def compile_listener(src: Union[ASTSource, IRSource], metadata: dict[str, Any], times: CompileTimes,
@@ -26,7 +26,7 @@ def compile_listener(src: Union[ASTSource, IRSource], metadata: dict[str, Any],
         assert captured is None
         captured = (src, metadata, times, cache_hit)
 
-    fresh_knobs.compilation.listener = compile_listener
+    fresh_knobs_except_libraries.compilation.listener = compile_listener
 
     x = torch.randn(4, device=device)
     cumsum_kernel[(1, )](x)

python/test/unit/test_knobs.py

Lines changed: 7 additions & 1 deletion
@@ -2,6 +2,7 @@
 import pytest
 import shutil
 import triton
+from triton._internal_testing import is_hip
 
 from pathlib import Path
 
@@ -136,6 +137,7 @@ def test_read_env(truthy, falsey, fresh_knobs, monkeypatch):
     assert fresh_knobs.cache.override_dir == "/tmp/triton_home/.triton/override"
 
     from triton.runtime.cache import FileCacheManager
+
     assert fresh_knobs.cache.manager_class == FileCacheManager
 
     assert fresh_knobs.build.backend_dirs == {"/tmp/cuda/crt", "/tmp/cuda/rt"}
@@ -216,8 +218,12 @@ class TestManagerClass(FileCacheManager):
     assert fresh_knobs.cache.manager_class == FileCacheManager
 
 
+@pytest.mark.skipif(
+    is_hip(),
+    reason="PTXAS is not installed on AMD",
+)
 def test_nvidia_tool(fresh_knobs, tmp_path, monkeypatch):
-    triton_root = Path(__file__).parent.parent.parent / "triton"
+    triton_root = Path(fresh_knobs.__file__).parent
     default_ptxas = triton_root / "backends/nvidia/bin/ptxas"
 
     assert default_ptxas.exists()

python/triton/knobs.py

Lines changed: 2 additions & 2 deletions
@@ -46,7 +46,7 @@ def __set_name__(self, objclass: Type[object], name: str) -> None:
 
     def __get__(self, obj: Optional[object], objclass: Optional[Type[object]]) -> GetType:
         if obj is None:
-            raise AttributeError("Cannot access {type(self)} on non-instance")
+            raise AttributeError(f"Cannot access {type(self)} on non-instance")
 
         if self.name in obj.__dict__:
             return self.transform(obj.__dict__[self.name])
@@ -311,7 +311,7 @@ def copy(self: knobs_type) -> knobs_type:
         return res
 
     def reset(self: knobs_type) -> knobs_type:
-        for knob in self.knobs.keys():
+        for knob in self.knob_descriptors.keys():
            delattr(self, knob)
         return self
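
To show the behaviour the reset() fix enables, here is a rough sketch (not from the commit) of the snapshot/reset/restore pattern the conftest fixtures above rely on; my_listener is a hypothetical callback and the default values are assumptions.

from triton import knobs

def my_listener(*args, **kwargs):           # hypothetical compilation listener
    pass

saved = knobs.compilation                   # keep the live knob set
knobs.compilation = saved.copy().reset()    # fresh copy; reset() delattr's each
                                            # knob so env vars / defaults apply
try:
    knobs.compilation.listener = my_listener
finally:
    knobs.compilation = saved               # restore the original knob set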

third_party/nvidia/backend/compiler.py

Lines changed: 1 addition & 1 deletion
@@ -277,8 +277,8 @@ def make_ttgir(mod, metadata, opt, capability):
     passes.common.add_cse(pm)
     passes.common.add_symbol_dce(pm)
     if capability // 10 >= 9:
-        nvidia.passes.ttnvgpuir.add_fence_insertion(pm)
         nvidia.passes.ttnvgpuir.add_tma_lowering(pm)
+        nvidia.passes.ttnvgpuir.add_fence_insertion(pm)
     passes.common.add_canonicalizer(pm)
     pm.run(mod)
     metadata["cluster_dims"] = (cluster_info.clusterDimX, cluster_info.clusterDimY, cluster_info.clusterDimZ)
