Skip to content

Commit 2bd6607

Browse files
committed
Merge commit '167ed2866c56c0b5e168d7f02dddec51c8ff4c5b'
2 parents c966e3b + 167ed28 commit 2bd6607

File tree

20 files changed

+161
-264
lines changed

20 files changed

+161
-264
lines changed

.github/workflows/build-macos.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ jobs:
1818
timeout-minutes: 60
1919
env:
2020
RUNNER_TYPE: ${{ matrix.runner[0] }}
21+
TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
2122
name: Build MacOS
2223
steps:
2324
- name: Checkout
@@ -97,7 +98,7 @@ jobs:
9798
env:
9899
TRITON_BUILD_WITH_O1: "true"
99100
# macos-latest has 3 vCPUs and 7 GB of DRAM; to save memory, we limit the number of jobs to 3
100-
# https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners/about-github-hosted-runners#standard-github-hosted-runners-for-public-repositories
101+
# https://docs.github.com/en/actions/reference/github-hosted-runners-reference#standard-github-hosted-runners-for-public-repositories
101102
MAX_JOBS: 3
102103
# Add elapsed time in seconds to ninja status to monitor where build stalls
103104
NINJA_STATUS: "[%f/%t, %es elapsed] "

.github/workflows/ci.yml

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,6 @@ concurrency:
1212
group: ${{ github.ref }}
1313
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
1414
permissions: read-all
15-
env:
16-
TRITON_BUILD_WITH_CCACHE: "true"
17-
TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
18-
TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
19-
TRITON_DISABLE_LINE_INFO: 1
20-
PROTON_SKIP_PC_SAMPLING_TEST: 1
21-
PYTHON: "python3"
22-
CCACHE_COMPRESS: "true"
2315

2416
jobs:
2517

.github/workflows/integration-tests-nvidia.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ jobs:
4949
echo "llvm=$(cat $llvm_file | cut -c 1-8)" >> $GITHUB_OUTPUT
5050
echo "nvidia=$(sha256sum $nvidia_file | cut -d ' ' -f 1)" >> $GITHUB_OUTPUT
5151
echo "json=$(cat $json_file)" >> $GITHUB_OUTPUT
52-
echo "datetime=$(date -u -Iseconds)" >> $GITHUB_OUTPUT
5352
shell: bash
5453
- name: Cache build dependencies
5554
uses: actions/cache@v4

docs/getting-started/installation.rst

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,7 @@ You can install the latest stable release of Triton from pip:
1414
1515
pip install triton
1616
17-
Binary wheels are available for CPython 3.8-3.12 and PyPy 3.8-3.9.
18-
19-
And the latest nightly release:
20-
21-
.. code-block:: bash
22-
23-
pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
24-
17+
Binary wheels are available for CPython 3.9-3.13.
2518

2619
-----------
2720
From Source
@@ -35,25 +28,25 @@ You can install the Python package from source by running the following commands
3528

3629
.. code-block:: bash
3730
38-
git clone https://github.com/triton-lang/triton.git;
39-
cd triton/python;
40-
pip install ninja cmake wheel; # build-time dependencies
31+
git clone https://github.com/triton-lang/triton.git
32+
cd triton
33+
34+
pip install -r python/requirements.txt # build-time dependencies
4135
pip install -e .
4236
4337
Note that, if LLVM is not present on your system, the setup.py script will download the official LLVM static libraries and link against them.
4438

4539
For building with a custom LLVM, review the `Building with a custom LLVM <https://github.com/triton-lang/triton?tab=readme-ov-file#building-with-a-custom-llvm>`_ section on GitHub.
4640

47-
You can then test your installation by running the unit tests:
41+
You can then test your installation by running the tests:
4842

4943
.. code-block:: bash
5044
51-
pip install -e '.[tests]'
52-
pytest -vs test/unit/
45+
# One-time setup
46+
make dev-install
5347
54-
and the benchmarks
55-
56-
.. code-block:: bash
48+
# To run all tests (requires a GPU)
49+
make test
5750
58-
cd bench
59-
python -m run --with-plots --result-dir /tmp/triton-bench
51+
# Or, to run tests without a GPU
52+
make test-nogpu

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -383,8 +383,14 @@ def TT_AtomicRMWOp : TT_Op<"atomic_rmw", [
383383
}];
384384
}
385385

386-
def TT_AtomicCASOp : TT_Op<"atomic_cas", [SameOperandsAndResultShape,
387-
SameOperandsAndResultEncoding]> {
386+
def TT_AtomicCASOp : TT_Op<"atomic_cas", [
387+
SameOperandsAndResultShape,
388+
SameOperandsAndResultEncoding,
389+
TypesMatchWith<"ptr type matches cmp type", "cmp", "ptr",
390+
"getPointerTypeSameShape($_self)">,
391+
TypesMatchWith<"ptr type matches value type", "val", "ptr",
392+
"getPointerTypeSameShape($_self)">
393+
]> {
388394
let summary = "atomic cas";
389395

390396
let description = [{

include/triton/Tools/LayoutUtils.h

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -148,22 +148,6 @@ LinearLayout reshapeLayout(MLIRContext *ctx, LinearLayout layout,
148148
// order.
149149
LinearLayout transposeLinearLayout(LinearLayout layout, ArrayRef<int> order);
150150

151-
// Reorders the in and out dimensions to match another layout.
152-
LinearLayout reorder_like(const LinearLayout &x, const LinearLayout &y);
153-
154-
// For two layouts, `src` and `dst`, that differ only by a permutation of
155-
// their basis vectors, return a permutation layout `P` which satisfies
156-
// `dst` \circ `P` = `src`.
157-
//
158-
// The returned layout has the following properties:
159-
// - The orders of the input and output dimensions of `P` match the order of the
160-
// input dimensions of `src`.
161-
// - Prioritizes making zero (broadcasting) vectors fixed-points of the
162-
// permutation. I.e., if a vector is zero in both `src` and `dst` for the same
163-
// input coordinate, it maps to itself under `P`.
164-
LinearLayout basisPermutationLayout(const LinearLayout &src,
165-
const LinearLayout &dst);
166-
167151
} // namespace mlir::triton
168152

169153
#endif // TRITON_TOOLS_LAYOUTUTILS_H

lib/Analysis/Utility.cpp

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -284,10 +284,11 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
284284
// subsequences of consecutive lane bits from cycles involving both bit types.
285285
// Further explanation of this method is below.
286286
//
287-
// The decomposition is implemented by building bases for the layouts `pReg`
288-
// and `pLane` by walking the cycles of `P`, a permutation layout returned by
289-
// `basisPermutationLayout(S, T)` which accepts two layouts `S` and `T` which
290-
// differ only by a permutation of their basis vectors.
287+
// The decomposition is performed in two stages. First, we compute the
288+
// permutation matrix `P` by using `invertAndCompose` to generate a skeleton
289+
// and then fill in any zero columns. Second, we walk the cycles of `P` to
290+
// factor out mixed transpositions to build `mixedTranspositions`, `pReg`, and
291+
// `pLane`.
291292

292293
// We remove any broadcasting in the register dimensions of the layouts before
293294
// forming the permutation `P` as the components of the decomposition directly
@@ -316,9 +317,10 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
316317
int nRegBases = std::max(nSrcRegBases, nDstRegBases);
317318
int nLaneBases = std::max(nSrcLaneBases, nDstLaneBases);
318319
// Restrict attention to the input dimensions which matter.
320+
SmallVector<StringAttr> inDimNames{kReg, kLane};
319321
auto outDimNames = llvm::to_vector(srcLayout.getOutDimNames());
320-
auto S = srcLayout.sublayout({kReg, kLane}, outDimNames);
321-
auto T = dstLayout.sublayout({kReg, kLane}, outDimNames);
322+
auto S = srcLayout.sublayout(inDimNames, outDimNames);
323+
auto T = dstLayout.sublayout(inDimNames, outDimNames);
322324
// Conditionally pad.
323325
if (nSrcRegBases != nDstRegBases || nSrcLaneBases != nDstLaneBases) {
324326
auto padWithZeros = [&](const LinearLayout &ll) {
@@ -340,10 +342,41 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
340342
T = padWithZeros(T);
341343
}
342344

343-
// Now that `S` and `T` have the same basis vectors, we compute the
344-
// permutation `P` which transforms `S` into `T`.
345-
auto P = basisPermutationLayout(S, T);
346-
auto &pBases = P.getBases();
345+
// Flatten outs for ease of building `P`, and reorder outs as flattening
346+
// depends on output dimension order.
347+
if (outDimNames != llvm::to_vector(T.getOutDimNames()))
348+
T = T.transposeOuts(outDimNames);
349+
S = S.flattenOuts();
350+
T = T.flattenOuts();
351+
352+
// We compute T^transpose \circ S, which serves as a skeleton for `P`, then
353+
// fill in zero columns, prioritizing producing fixed points. As we only need
354+
// the basis vectors of `P`, we never actually produce the LinearLayout.
355+
auto pBases = S.invertAndCompose(T).getBases();
356+
357+
// Find the common and uncommon zeros of S and T
358+
SmallVector<std::pair<int32_t, int32_t>> srcFreeZeros;
359+
SmallVector<std::pair<int32_t, int32_t>> dstFreeZeros;
360+
for (auto [dimIdx, dim] : llvm::enumerate(inDimNames)) {
361+
for (int inIdx = 0; inIdx < S.getInDimSizeLog2(dim); ++inIdx) {
362+
int sVal = S.getBasis(dim, inIdx)[0];
363+
int tVal = T.getBasis(dim, inIdx)[0];
364+
if (sVal == 0 && tVal == 0) {
365+
pBases[dim][inIdx][dimIdx] = 1 << inIdx;
366+
} else if (sVal == 0) {
367+
srcFreeZeros.emplace_back(dimIdx, inIdx);
368+
} else if (tVal == 0) {
369+
dstFreeZeros.emplace_back(dimIdx, inIdx);
370+
}
371+
}
372+
}
373+
// Fill in non-fixed-point zero vectors
374+
for (auto [srcZeroLoc, dstZeroLoc] : llvm::zip(srcFreeZeros, dstFreeZeros)) {
375+
auto [srcDimIdx, srcIdx] = srcZeroLoc;
376+
auto [dstDimIdx, dstIdx] = dstZeroLoc;
377+
auto inDim = inDimNames[srcDimIdx];
378+
pBases[inDim][srcIdx][dstDimIdx] = 1 << dstIdx;
379+
}
347380

348381
// We walk the cycles of `P` to build the bases for `pReg` and `pLane` while
349382
// factoring out mixed transpositions from cycles that include both register
@@ -361,9 +394,8 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
361394
return (dim == kReg) ? index : nRegBases + index;
362395
};
363396

364-
auto dimNames = llvm::to_vector(P.getInDimNames());
365-
for (auto dim : dimNames) {
366-
int inDimSize = P.getInDimSizeLog2(dim);
397+
for (auto dim : inDimNames) {
398+
int inDimSize = S.getInDimSizeLog2(dim);
367399
for (int i = 0; i < inDimSize; ++i) {
368400
if (visited.test(flatIdx(dim, i)))
369401
continue;
@@ -399,7 +431,7 @@ getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
399431
int32_t nextIdx;
400432
for (auto [nextDimIdx, nextVal] : llvm::enumerate(nextVec)) {
401433
if (nextVal != 0) {
402-
nextDim = dimNames[nextDimIdx];
434+
nextDim = inDimNames[nextDimIdx];
403435
nextIdx = llvm::Log2_32(nextVal);
404436
}
405437
}

lib/Tools/LayoutUtils.cpp

Lines changed: 0 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#include "triton/Tools/LayoutUtils.h"
22
#include "triton/Tools/GenericSwizzling.h"
3-
#include "llvm/ADT/SmallSet.h"
43

54
namespace mlir::triton {
65

@@ -447,137 +446,4 @@ LinearLayout transposeLinearLayout(LinearLayout layout, ArrayRef<int> order) {
447446
to_vector(layout.getOutDimNames()));
448447
}
449448

450-
LinearLayout reorder_like(const LinearLayout &x, const LinearLayout &y) {
451-
// This will check that the names are the same up to permutation, and
452-
// apply the necessary permutation:
453-
auto x2 = x.transposeOuts(llvm::to_vector(y.getOutDimNames()));
454-
auto x3 = x2.transposeIns(llvm::to_vector(y.getInDimNames()));
455-
return x3;
456-
}
457-
458-
LinearLayout basisPermutationLayout(const LinearLayout &src,
459-
const LinearLayout &dst) {
460-
// This function computes a permutation layout `P` which satisfies the
461-
// property `src = dst \circ P`. It requires that the multiset of basis
462-
// vectors for each of `src` and `dst` agree and that the nonzero values in
463-
// each of the multisets are unique. I.e., broadcasting is allowed in either
464-
// layout so long as the degree of broadcasting (the number of zero basis
465-
// vectors) is the same between the two layouts.
466-
//
467-
// The orders of the input and output dimensions of `P` are set to be the
468-
// order of the input dimensions of `src`.
469-
//
470-
// The mapping of broadcasting basis vectors prioritizes keeping such vectors
471-
// as fixed points of the permutation. I.e., if `src[inDim][i]` and
472-
// `dst[inDim][i]` are zero vectors, then `P[inDim][i][inDimIdx] == 1 << i`,
473-
// where `inDimIdx` is the index of `inDim` in `src`. Otherwise, they are
474-
// paired according to their order of appearance in the two layouts, again
475-
// following the order of the input dimensions of `src`.
476-
//
477-
// The algorithm first performs a linear scan over the columns of `dst` and
478-
// `src` to build a map from ('flattened') basis vectors to the input
479-
// vectors of `dst` while tracking the fixed-point zero vectors and 'free'
480-
// zero vectors. It then performs a second linear scan over `src` to build
481-
// the basis of `P`.
482-
483-
// Check that the input and output dimensions are equal up to ordering.
484-
auto srcInDims = src.getInDimNames();
485-
assert(std::is_permutation(srcInDims.begin(), srcInDims.end(),
486-
dst.getInDimNames().begin()) &&
487-
"Layouts must have same input dimensions");
488-
for (auto inDim : srcInDims) {
489-
assert(src.getInDimSize(inDim) == dst.getInDimSize(inDim) &&
490-
"Layouts must have same input dimension sizes");
491-
}
492-
auto srcOutDims = src.getOutDims();
493-
assert(std::is_permutation(srcOutDims.begin(), srcOutDims.end(),
494-
dst.getOutDims().begin()) &&
495-
"Layouts must have same output dimensions and dimension sizes");
496-
497-
auto srcFlat = src.flattenOuts();
498-
// Reorder the output dimensions of `dst` if necessary before flattening, as
499-
// flattening depends on the order.
500-
LinearLayout dstFlat;
501-
if (!llvm::equal(src.getOutDims(), dst.getOutDims())) {
502-
auto temp = dst.transposeOuts(llvm::to_vector(src.getOutDimNames()));
503-
dstFlat = temp.flattenOuts();
504-
} else {
505-
dstFlat = dst.flattenOuts();
506-
}
507-
508-
// Populate the map of flattened values to dst inputs and track zero vectors.
509-
// The `commonZeros` become fixed-points of `P`, while the 'free' zeros are
510-
// later paired with one another.
511-
DenseMap<int32_t, std::pair<StringAttr, int32_t>> valToDstInput;
512-
llvm::SmallDenseMap<StringAttr, llvm::SmallSet<int32_t, 4>> commonZeros;
513-
SmallVector<std::pair<StringAttr, int32_t>> dstFreeZeros;
514-
size_t srcFreeZerosCount = 0;
515-
516-
// We traverse the input dimensions according to their order in `src` so that
517-
// 'free' zero vectors for a given input dimension in `src` prefer to map to
518-
// 'free' zero vectors in the same dimension in `dst`.
519-
for (auto inDim : srcInDims) {
520-
int inDimSize = dstFlat.getInDimSizeLog2(inDim);
521-
for (int i = 0; i < inDimSize; ++i) {
522-
int32_t dstVal = dstFlat.getBasis(inDim, i)[0];
523-
int32_t srcVal = srcFlat.getBasis(inDim, i)[0];
524-
if (dstVal == 0 && srcVal == 0) {
525-
commonZeros[inDim].insert(i);
526-
} else if (dstVal == 0) {
527-
dstFreeZeros.emplace_back(inDim, i);
528-
} else {
529-
auto [it, success] = valToDstInput.try_emplace(dstVal, inDim, i);
530-
assert(success && "Found duplicate nonzero vectors in dst layout");
531-
if (srcVal == 0)
532-
++srcFreeZerosCount;
533-
}
534-
}
535-
}
536-
assert(srcFreeZerosCount == dstFreeZeros.size() &&
537-
"src and dst layouts have differing number of zero bases");
538-
539-
// Build the basis vectors for the permutation layout `P`.
540-
// For each basis vector in `src`, determine its target in `dst`:
541-
// - If the vector is nonzero, find the corresponding vector in `dst`.
542-
// - If it is a zero vector common to both layouts, set it as a fixed-point.
543-
// - Otherwise, pair it with the next available free zero of `dst`.
544-
LinearLayout::BasesT pBases;
545-
size_t numDims = llvm::size(srcInDims);
546-
size_t freeZeroIdx = 0;
547-
for (auto inDim : srcInDims) {
548-
int inDimSize = srcFlat.getInDimSizeLog2(inDim);
549-
auto &inDimBases = pBases[inDim];
550-
inDimBases.reserve(inDimSize);
551-
for (int i = 0; i < inDimSize; ++i)
552-
inDimBases.emplace_back(numDims, 0);
553-
554-
for (int inIdx = 0; inIdx < inDimSize; ++inIdx) {
555-
int32_t val = srcFlat.getBasis(inDim, inIdx)[0];
556-
std::pair<StringAttr, int32_t> dstTarget;
557-
558-
if (val != 0) {
559-
auto it = valToDstInput.find(val);
560-
assert(it != valToDstInput.end() && "src basis not found in dst");
561-
dstTarget = it->second;
562-
} else if (commonZeros.lookup(inDim).count(inIdx)) {
563-
dstTarget = {inDim, inIdx};
564-
} else {
565-
dstTarget = dstFreeZeros[freeZeroIdx++];
566-
}
567-
568-
// Build the basis vector for `P` using the ordering on output dimensions
569-
// induced by the ordering on the input dimensions of `src`.
570-
auto it = llvm::find(srcInDims, dstTarget.first);
571-
int outDimIdx = std::distance(srcInDims.begin(), it);
572-
inDimBases[inIdx][outDimIdx] = 1 << dstTarget.second;
573-
}
574-
}
575-
// Declare the ordering on the `outDims` of `P` to be that of `srcInDims`.
576-
SmallVector<std::pair<StringAttr, int32_t>> outDims;
577-
for (auto outDim : srcInDims)
578-
outDims.emplace_back(outDim, srcFlat.getInDimSize(outDim));
579-
580-
return LinearLayout(std::move(pBases), outDims, /*requireSurjective=*/true);
581-
}
582-
583449
} // namespace mlir::triton

python/triton/knobs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import functools
34
import importlib
45
import os
56
import re
@@ -171,6 +172,7 @@ class NvidiaTool:
171172
version: str
172173

173174
@staticmethod
175+
@functools.lru_cache
174176
def from_path(path: str) -> Optional[NvidiaTool]:
175177
try:
176178
result = subprocess.check_output([path, "--version"], stderr=subprocess.STDOUT)

0 commit comments

Comments
 (0)