intel
diff --git a/‎.github/workflows/build-macos.yml
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/build-macos.yml
Lines changed: 2 additions & 1 deletion
diff --git a/‎.github/workflows/ci.yml
Lines changed: 0 additions & 8 deletions b/‎.github/workflows/ci.yml
Lines changed: 0 additions & 8 deletions
diff --git a/‎.github/workflows/integration-tests-nvidia.yml
Lines changed: 0 additions & 1 deletion b/‎.github/workflows/integration-tests-nvidia.yml
Lines changed: 0 additions & 1 deletion
diff --git a/‎docs/getting-started/installation.rst
Lines changed: 12 additions & 19 deletions b/‎docs/getting-started/installation.rst
Lines changed: 12 additions & 19 deletions
diff --git a/‎include/triton/Dialect/Triton/IR/TritonOps.td
Lines changed: 8 additions & 2 deletions b/‎include/triton/Dialect/Triton/IR/TritonOps.td
Lines changed: 8 additions & 2 deletions
diff --git a/‎include/triton/Tools/LayoutUtils.h
Lines changed: 0 additions & 16 deletions b/‎include/triton/Tools/LayoutUtils.h
Lines changed: 0 additions & 16 deletions
diff --git a/‎lib/Analysis/Utility.cpp
Lines changed: 46 additions & 14 deletions b/‎lib/Analysis/Utility.cpp
Lines changed: 46 additions & 14 deletions
diff --git a/‎lib/Tools/LayoutUtils.cpp
Lines changed: 0 additions & 134 deletions b/‎lib/Tools/LayoutUtils.cpp
Lines changed: 0 additions & 134 deletions
diff --git a/‎python/triton/knobs.py
Lines changed: 2 additions & 0 deletions b/‎python/triton/knobs.py
Lines changed: 2 additions & 0 deletions
@@ -18,6 +18,7 @@ jobs:
     timeout-minutes: 60
     env:
       RUNNER_TYPE: ${{ matrix.runner[0] }}
+      TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
     name: Build MacOS
     steps:
       - name: Checkout
@@ -97,7 +98,7 @@ jobs:
         env:
           TRITON_BUILD_WITH_O1: "true"
           # macos-latest has 3 vCPUs and 7GB DRAM; to save memory we limit the number of jobs to 3
-          # https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners/about-github-hosted-runners#standard-github-hosted-runners-for-public-repositories
+          # https://docs.github.com/en/actions/reference/github-hosted-runners-reference#standard-github-hosted-runners-for-public-repositories
           MAX_JOBS: 3
           # Add elapsed time in seconds to ninja status to monitor where build stalls
           NINJA_STATUS: "[%f/%t, %es elapsed] "
 
@@ -12,14 +12,6 @@ concurrency:
   group: ${{ github.ref }}
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 permissions: read-all
-env:
-  TRITON_BUILD_WITH_CCACHE: "true"
-  TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
-  TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
-  TRITON_DISABLE_LINE_INFO: 1
-  PROTON_SKIP_PC_SAMPLING_TEST: 1
-  PYTHON: "python3"
-  CCACHE_COMPRESS: "true"
 
 jobs:
 
 
@@ -49,7 +49,6 @@ jobs:
           echo "llvm=$(cat $llvm_file | cut -c 1-8)" >> $GITHUB_OUTPUT
           echo "nvidia=$(sha256sum $nvidia_file | cut -d ' ' -f 1)" >> $GITHUB_OUTPUT
           echo "json=$(cat $json_file)" >> $GITHUB_OUTPUT
-          echo "datetime=$(date -u -Iseconds)" >> $GITHUB_OUTPUT
         shell: bash
       - name: Cache build dependencies
         uses: actions/cache@v4
 
@@ -14,14 +14,7 @@ You can install the latest stable release of Triton from pip:
 
       pip install triton
 
-Binary wheels are available for CPython 3.8-3.12 and PyPy 3.8-3.9.
-
-And the latest nightly release:
-
-.. code-block:: bash
-
-      pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
-
+Binary wheels are available for CPython 3.9-3.13.
 
 -----------
 From Source
@@ -35,25 +28,25 @@ You can install the Python package from source by running the following commands
 
 .. code-block:: bash
 
-      git clone https://github.com/triton-lang/triton.git;
-      cd triton/python;
-      pip install ninja cmake wheel; # build-time dependencies
+      git clone https://github.com/triton-lang/triton.git
+      cd triton
+
+      pip install -r python/requirements.txt # build-time dependencies
       pip install -e .
 
 Note that, if LLVM is not present on your system, the setup.py script will download the official LLVM static libraries and link against them.
 
 For building with a custom LLVM, review the `Building with a custom LLVM <https://github.com/triton-lang/triton?tab=readme-ov-file#building-with-a-custom-llvm>`_ section on GitHub.
 
-You can then test your installation by running the unit tests:
+You can then test your installation by running the tests:
 
 .. code-block:: bash
 
-      pip install -e '.[tests]'
-      pytest -vs test/unit/
+      # One-time setup
+      make dev-install
 
-and the benchmarks
-
-.. code-block:: bash
+      # To run all tests (requires a GPU)
+      make test
 
-      cd bench
-      python -m run --with-plots --result-dir /tmp/triton-bench
+      # Or, to run tests without a GPU
+      make test-nogpu
@@ -383,8 +383,14 @@ def TT_AtomicRMWOp : TT_Op<"atomic_rmw", [
     }];
 }
 
-def TT_AtomicCASOp : TT_Op<"atomic_cas", [SameOperandsAndResultShape,
-                                          SameOperandsAndResultEncoding]> {
+def TT_AtomicCASOp : TT_Op<"atomic_cas", [
+  SameOperandsAndResultShape,
+  SameOperandsAndResultEncoding,
+  TypesMatchWith<"ptr type matches cmp type", "cmp", "ptr",
+                  "getPointerTypeSameShape($_self)">,
+  TypesMatchWith<"ptr type matches value type", "val", "ptr",
+                  "getPointerTypeSameShape($_self)">
+]> {
     let summary = "atomic cas";
 
     let description = [{
 
@@ -148,22 +148,6 @@ LinearLayout reshapeLayout(MLIRContext *ctx, LinearLayout layout,
 // order.
 LinearLayout transposeLinearLayout(LinearLayout layout, ArrayRef<int> order);
 
-// Reorders the in and out dimensions to match another layout.
-LinearLayout reorder_like(const LinearLayout &x, const LinearLayout &y);
-
-// For two layouts, `src` and `dst`, that differ only by a permutation of
-// their basis vectors, return a permutation layout `P` which satisfies
-// `dst` \circ `P` = `src`.
-//
-// The returned layout has the following properties:
-// - The orders of the input and output dimensions of `P` match the order of the
-//   input dimensions of `src`.
-// - Prioritizes making zero (broadcasting) vectors fixed-points of the
-//   permutation. I.e., if a vector is zero in both `src` and `dst` for the same
-//   input coordinate, it maps to itself under `P`.
-LinearLayout basisPermutationLayout(const LinearLayout &src,
-                                    const LinearLayout &dst);
-
 } // namespace mlir::triton
 
 #endif // TRITON_TOOLS_LAYOUTUTILS_H
@@ -284,10 +284,11 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
   // subsequences of consecutive lane bits from cycles involving both bit types.
   // Further explanation of this method is below.
   //
-  // The decomposition is implemented by building bases for the layouts `pReg`
-  // and `pLane` by walking the cycles of `P`, a permutation layout returned by
-  // `basisPermutationLayout(S, T)` which accepts two layouts `S` and `T` which
-  // differ only by a permutation of their basis vectors.
+  // The decomposition is performed in two stages. First, we compute the
+  // permutation matrix `P` by using `invertAndCompose` to generate a skeleton
+  // and then fill in any zero columns. Second, we walk the cycles of `P` to
+  // factor out mixed transpositions to build `mixedTranspositions`, `pReg`, and
+  // `pLane`.
 
   // We remove any broadcasting in the register dimensions of the layouts before
   // forming the permutation `P` as the components of the decomposition directly
@@ -316,9 +317,10 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
   int nRegBases = std::max(nSrcRegBases, nDstRegBases);
   int nLaneBases = std::max(nSrcLaneBases, nDstLaneBases);
   // Restrict attention to the input dimensions which matter.
+  SmallVector<StringAttr> inDimNames{kReg, kLane};
   auto outDimNames = llvm::to_vector(srcLayout.getOutDimNames());
-  auto S = srcLayout.sublayout({kReg, kLane}, outDimNames);
-  auto T = dstLayout.sublayout({kReg, kLane}, outDimNames);
+  auto S = srcLayout.sublayout(inDimNames, outDimNames);
+  auto T = dstLayout.sublayout(inDimNames, outDimNames);
   // Conditionally pad.
   if (nSrcRegBases != nDstRegBases || nSrcLaneBases != nDstLaneBases) {
     auto padWithZeros = [&](const LinearLayout &ll) {
@@ -340,10 +342,41 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
     T = padWithZeros(T);
   }
 
-  // Now that `S` and `T` have the same basis vectors, we compute the
-  // permutation `P` which transforms `S` into `T`.
-  auto P = basisPermutationLayout(S, T);
-  auto &pBases = P.getBases();
+  // Flatten outs for ease of building `P`, and reorder outs as flattening
+  // depends on output dimension order.
+  if (outDimNames != llvm::to_vector(T.getOutDimNames()))
+    T = T.transposeOuts(outDimNames);
+  S = S.flattenOuts();
+  T = T.flattenOuts();
+
+  // We compute T^transpose \circ S, which serves as a skeleton for `P`, then
+  // fill in zero columns, prioritizing producing fixed points. As we only need
+  // the basis vectors of `P`, we never actually produce the LinearLayout.
+  auto pBases = S.invertAndCompose(T).getBases();
+
+  // Find the common and uncommon zeros of S and T
+  SmallVector<std::pair<int32_t, int32_t>> srcFreeZeros;
+  SmallVector<std::pair<int32_t, int32_t>> dstFreeZeros;
+  for (auto [dimIdx, dim] : llvm::enumerate(inDimNames)) {
+    for (int inIdx = 0; inIdx < S.getInDimSizeLog2(dim); ++inIdx) {
+      int sVal = S.getBasis(dim, inIdx)[0];
+      int tVal = T.getBasis(dim, inIdx)[0];
+      if (sVal == 0 && tVal == 0) {
+        pBases[dim][inIdx][dimIdx] = 1 << inIdx;
+      } else if (sVal == 0) {
+        srcFreeZeros.emplace_back(dimIdx, inIdx);
+      } else if (tVal == 0) {
+        dstFreeZeros.emplace_back(dimIdx, inIdx);
+      }
+    }
+  }
+  // Fill in non-fixed-point zero vectors
+  for (auto [srcZeroLoc, dstZeroLoc] : llvm::zip(srcFreeZeros, dstFreeZeros)) {
+    auto [srcDimIdx, srcIdx] = srcZeroLoc;
+    auto [dstDimIdx, dstIdx] = dstZeroLoc;
+    auto inDim = inDimNames[srcDimIdx];
+    pBases[inDim][srcIdx][dstDimIdx] = 1 << dstIdx;
+  }
 
   // We walk the cycles of `P` to build the bases for `pReg` and `pLane` while
   // factoring out mixed transpositions from cycles that include both register
@@ -361,9 +394,8 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
     return (dim == kReg) ? index : nRegBases + index;
   };
 
-  auto dimNames = llvm::to_vector(P.getInDimNames());
-  for (auto dim : dimNames) {
-    int inDimSize = P.getInDimSizeLog2(dim);
+  for (auto dim : inDimNames) {
+    int inDimSize = S.getInDimSizeLog2(dim);
     for (int i = 0; i < inDimSize; ++i) {
       if (visited.test(flatIdx(dim, i)))
         continue;
@@ -399,7 +431,7 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
         int32_t nextIdx;
         for (auto [nextDimIdx, nextVal] : llvm::enumerate(nextVec)) {
           if (nextVal != 0) {
-            nextDim = dimNames[nextDimIdx];
+            nextDim = inDimNames[nextDimIdx];
             nextIdx = llvm::Log2_32(nextVal);
           }
         }
 
@@ -1,6 +1,5 @@
 #include "triton/Tools/LayoutUtils.h"
 #include "triton/Tools/GenericSwizzling.h"
-#include "llvm/ADT/SmallSet.h"
 
 namespace mlir::triton {
 
@@ -447,137 +446,4 @@ LinearLayout transposeLinearLayout(LinearLayout layout, ArrayRef<int> order) {
                       to_vector(layout.getOutDimNames()));
 }
 
-LinearLayout reorder_like(const LinearLayout &x, const LinearLayout &y) {
-  // This will check that the names are the same up to permutation, and
-  // apply the necessary permutation:
-  auto x2 = x.transposeOuts(llvm::to_vector(y.getOutDimNames()));
-  auto x3 = x2.transposeIns(llvm::to_vector(y.getInDimNames()));
-  return x3;
-}
-
-LinearLayout basisPermutationLayout(const LinearLayout &src,
-                                    const LinearLayout &dst) {
-  // This function computes a permutation layout `P` which satisfies the
-  // property `src = dst \circ P`. It requires that the multiset of basis
-  // vectors for each of `src` and `dst` agree and that the nonzero values in
-  // each of the multisets are unique. I.e., broadcasting is allowed in either
-  // layout so long as the degree of broadcasting (the number of zero basis
-  // vectors) is the same between the two layouts.
-  //
-  // The orders of the input and output dimensions of `P` are set to be the
-  // order of the input dimensions of `src`.
-  //
-  // The mapping of broadcasting basis vectors prioritizes keeping such vectors
-  // as fixed points of the permutation. I.e., if `src[inDim][i]` and
-  // `dst[inDim][i]` are zero vectors, then `P[inDim][i][inDimIdx] == 1 << i`,
-  // where `inDimIdx` is the index of `inDim` in `src`. Otherwise, they are
-  // paired according to their order of appearance in the two layouts, again
-  // following the order of the input dimensions of `src`.
-  //
-  // The algorithm first performs a linear scan over the columns of `dst` and
-  // `src` to build a map from ('flattened') basis vectors to the input
-  // vectors of `dst` while tracking the fixed-point zero vectors and 'free'
-  // zero vectors. It then performs a second linear scan over `src` to build
-  // the basis of `P`.
-
-  // Check that the input and output dimensions are equal up to ordering.
-  auto srcInDims = src.getInDimNames();
-  assert(std::is_permutation(srcInDims.begin(), srcInDims.end(),
-                             dst.getInDimNames().begin()) &&
-         "Layouts must have same input dimensions");
-  for (auto inDim : srcInDims) {
-    assert(src.getInDimSize(inDim) == dst.getInDimSize(inDim) &&
-           "Layouts must have same input dimension sizes");
-  }
-  auto srcOutDims = src.getOutDims();
-  assert(std::is_permutation(srcOutDims.begin(), srcOutDims.end(),
-                             dst.getOutDims().begin()) &&
-         "Layouts must have same output dimensions and dimension sizes");
-
-  auto srcFlat = src.flattenOuts();
-  // Reorder the output dimensions of `dst` if necessary before flattening, as
-  // flattening depends on the order.
-  LinearLayout dstFlat;
-  if (!llvm::equal(src.getOutDims(), dst.getOutDims())) {
-    auto temp = dst.transposeOuts(llvm::to_vector(src.getOutDimNames()));
-    dstFlat = temp.flattenOuts();
-  } else {
-    dstFlat = dst.flattenOuts();
-  }
-
-  // Populate the map of flattened values to dst inputs and track zero vectors.
-  // The `commonZeros` become fixed-points of `P`, while the 'free' zeros are
-  // later paired with one another.
-  DenseMap<int32_t, std::pair<StringAttr, int32_t>> valToDstInput;
-  llvm::SmallDenseMap<StringAttr, llvm::SmallSet<int32_t, 4>> commonZeros;
-  SmallVector<std::pair<StringAttr, int32_t>> dstFreeZeros;
-  size_t srcFreeZerosCount = 0;
-
-  // We traverse the input dimensions according to their order in `src` so that
-  // 'free' zero vectors for a given input dimension in `src` prefer to map to
-  // 'free' zero vectors in the same dimension in `dst`.
-  for (auto inDim : srcInDims) {
-    int inDimSize = dstFlat.getInDimSizeLog2(inDim);
-    for (int i = 0; i < inDimSize; ++i) {
-      int32_t dstVal = dstFlat.getBasis(inDim, i)[0];
-      int32_t srcVal = srcFlat.getBasis(inDim, i)[0];
-      if (dstVal == 0 && srcVal == 0) {
-        commonZeros[inDim].insert(i);
-      } else if (dstVal == 0) {
-        dstFreeZeros.emplace_back(inDim, i);
-      } else {
-        auto [it, success] = valToDstInput.try_emplace(dstVal, inDim, i);
-        assert(success && "Found duplicate nonzero vectors in dst layout");
-        if (srcVal == 0)
-          ++srcFreeZerosCount;
-      }
-    }
-  }
-  assert(srcFreeZerosCount == dstFreeZeros.size() &&
-         "src and dst layouts have differing number of zero bases");
-
-  // Build the basis vectors for the permutation layout `P`.
-  // For each basis vector in `src`, determine its target in `dst`:
-  // - If the vector is nonzero, find the corresponding vector in `dst`.
-  // - If it is a zero vector common to both layouts, set it as a fixed-point.
-  // - Otherwise, pair it with the next available free zero of `dst`.
-  LinearLayout::BasesT pBases;
-  size_t numDims = llvm::size(srcInDims);
-  size_t freeZeroIdx = 0;
-  for (auto inDim : srcInDims) {
-    int inDimSize = srcFlat.getInDimSizeLog2(inDim);
-    auto &inDimBases = pBases[inDim];
-    inDimBases.reserve(inDimSize);
-    for (int i = 0; i < inDimSize; ++i)
-      inDimBases.emplace_back(numDims, 0);
-
-    for (int inIdx = 0; inIdx < inDimSize; ++inIdx) {
-      int32_t val = srcFlat.getBasis(inDim, inIdx)[0];
-      std::pair<StringAttr, int32_t> dstTarget;
-
-      if (val != 0) {
-        auto it = valToDstInput.find(val);
-        assert(it != valToDstInput.end() && "src basis not found in dst");
-        dstTarget = it->second;
-      } else if (commonZeros.lookup(inDim).count(inIdx)) {
-        dstTarget = {inDim, inIdx};
-      } else {
-        dstTarget = dstFreeZeros[freeZeroIdx++];
-      }
-
-      // Build the basis vector for `P` using the ordering on output dimensions
-      // induced by the ordering on the input dimensions of `src`.
-      auto it = llvm::find(srcInDims, dstTarget.first);
-      int outDimIdx = std::distance(srcInDims.begin(), it);
-      inDimBases[inIdx][outDimIdx] = 1 << dstTarget.second;
-    }
-  }
-  // Declare the ordering on the `outDims` of `P` to be that of `srcInDims`.
-  SmallVector<std::pair<StringAttr, int32_t>> outDims;
-  for (auto outDim : srcInDims)
-    outDims.emplace_back(outDim, srcFlat.getInDimSize(outDim));
-
-  return LinearLayout(std::move(pBases), outDims, /*requireSurjective=*/true);
-}
-
 } // namespace mlir::triton
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import functools
 import importlib
 import os
 import re
@@ -171,6 +172,7 @@ class NvidiaTool:
     version: str
 
     @staticmethod
+    @functools.lru_cache
     def from_path(path: str) -> Optional[NvidiaTool]:
         try:
             result = subprocess.check_output([path, "--version"], stderr=subprocess.STDOUT)