21 changes: 14 additions & 7 deletions .github/workflows/integration-tests.yml
@@ -239,14 +239,14 @@ jobs:
cd python
LIT_TEST_DIR="build/$(ls build | grep -i cmake)/test"
if [ ! -d "${LIT_TEST_DIR}" ]; then
echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
echo "Could not find '${LIT_TEST_DIR}'" ; exit -1
fi
lit -v "${LIT_TEST_DIR}"
- name: Run python tests on CUDA
run: |
INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/build/$(ls python/build | grep -i lib)/triton/instrumentation"
if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
fi
cd python/test/unit
python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
@@ -268,14 +268,16 @@ jobs:
language/test_random.py language/test_block_pointer.py language/test_subprocess.py language/test_line_info.py \
runtime/test_autotuner.py::test_kwargs[False]\
../../tutorials/06-fused-attention.py::test_op --device cpu
- name: Run regression tests
run: |
cd python/test/regression
python3 -m pytest -s -n 8 .
- name: Run C++ unittests
run: |
cd python
cd "build/$(ls build | grep -i cmake)"
ctest -j32
- name: Run Proton tests
env:
LD_LIBRARY_PATH: "/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
run: |
cd third_party/proton
python3 -m pytest -s test
@@ -395,14 +397,14 @@ jobs:
cd python
LIT_TEST_DIR="build/$(ls build | grep -i cmake)/test"
if [ ! -d "${LIT_TEST_DIR}" ]; then
echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
echo "Could not find '${LIT_TEST_DIR}'" ; exit -1
fi
lit -v "${LIT_TEST_DIR}"
- name: Run python tests on HIP
run: |
INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
fi
pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
cd python/test/unit
@@ -416,10 +418,15 @@ jobs:

# Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s -n 8 language/test_line_info.py
- name: Run regression tests
run: |
# Reenable test_functional_regression.py once it's fixed
cd python/test/regression
python3 -m pytest -s -n 8 ./test_cast_matmul.py
- name: Run Proton tests
run: |
cd third_party/proton
python3 -m pytest test
python3 -m pytest -s test
- name: Run C++ unittests
run: |
cd python
24 changes: 15 additions & 9 deletions .github/workflows/integration-tests.yml.in
@@ -272,15 +272,15 @@ jobs:
cd python
LIT_TEST_DIR="build/$(ls build | grep -i cmake)/test"
if [ ! -d "${LIT_TEST_DIR}" ]; then
echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
echo "Could not find '${LIT_TEST_DIR}'" ; exit -1
fi
lit -v "${LIT_TEST_DIR}"

- name: Run python tests on CUDA
run: |
INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/build/$(ls python/build | grep -i lib)/triton/instrumentation"
if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
fi
cd python/test/unit
python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
@@ -304,16 +304,20 @@ jobs:
runtime/test_autotuner.py::test_kwargs[False]\
../../tutorials/06-fused-attention.py::test_op --device cpu

- name: Run regression tests
run: |
cd python/test/regression
python3 -m pytest -s -n 8 .

- &run-cpp-unittests-step
name: Run C++ unittests
run: |
cd python
cd "build/$(ls build | grep -i cmake)"
ctest -j32

- name: Run Proton tests
env:
LD_LIBRARY_PATH: "/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
- &run-proton-tests-step
name: Run Proton tests
run: |
cd third_party/proton
python3 -m pytest -s test
@@ -398,7 +402,7 @@ jobs:
run: |
INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
fi
pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
cd python/test/unit
@@ -413,11 +417,13 @@ jobs:
# Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s -n 8 language/test_line_info.py

- name: Run Proton tests
- name: Run regression tests
run: |
cd third_party/proton
python3 -m pytest test
# Reenable test_functional_regression.py once it's fixed
cd python/test/regression
python3 -m pytest -s -n 8 ./test_cast_matmul.py

- *run-proton-tests-step
- *run-cpp-unittests-step
- *save-build-artifacts-step
- *inspect-cache-directories-step
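
Note: the `.yml.in` template above reuses whole steps through standard YAML anchors (`&run-proton-tests-step`) and aliases (`*run-proton-tests-step`). The sketch below only illustrates generic YAML anchor/alias semantics with PyYAML; it is an assumption for illustration and makes no claim about how this repository expands the template into the final workflow file.

```python
# Sketch of YAML anchor/alias semantics, assuming PyYAML is installed.
import yaml

doc = """
cuda_steps:
  - &run-proton-tests-step
    name: Run Proton tests
    run: |
      cd third_party/proton
      python3 -m pytest -s test
hip_steps:
  - *run-proton-tests-step
"""

data = yaml.safe_load(doc)
# The alias resolves to the very same mapping that was anchored above,
# so both job lists end up with an identical step definition.
assert data["hip_steps"][0] == data["cuda_steps"][0]
```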
1 change: 0 additions & 1 deletion bin/triton-lsp.cpp
@@ -6,6 +6,5 @@ int main(int argc, char **argv) {
mlir::DialectRegistry registry;
registerTritonDialects(registry);

mlir::MLIRContext context(registry);
return mlir::failed(mlir::MlirLspServerMain(argc, argv, registry));
}
3 changes: 3 additions & 0 deletions docs/meetups/dev_conference_2024.md
@@ -0,0 +1,3 @@
The conference slides are available [here](https://drive.google.com/drive/folders/1osK9hwcX_lC1EjdZGB-v4w5oKx23UnU2?usp=drive_link)

The conference videos are available [here](https://www.youtube.com/playlist?list=PLc_vA1r0qoiTjlrINKUuFrI8Ptoopm8Vz).
2 changes: 2 additions & 0 deletions include/triton/Dialect/Triton/IR/Types.h
@@ -34,6 +34,8 @@ Type getI32SameShape(Type type);

Type getPointerTypeSameShape(Type type);

Type getPointerTypeToElement(Type type);

} // namespace triton

} // namespace mlir
46 changes: 44 additions & 2 deletions lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp
@@ -116,9 +116,20 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
RankedTensorType dstTy = op.getType();
Attribute srcLayout = srcTy.getEncoding();
Attribute dstLayout = dstTy.getEncoding();
// FIXME [Dot LL]
// Do for all DotOperandEncodingAttr once we have LLs for all of them
auto isAmpereLargeKWidth = [](Attribute layout) {
if (auto dot = dyn_cast<DotOperandEncodingAttr>(layout)) {
if (auto mma = dyn_cast<NvidiaMmaEncodingAttr>(dot.getParent())) {
return mma.isAmpere() && dot.getKWidth() == 8;
}
}
return false;
};
if (isa<SharedEncodingAttr>(srcLayout) &&
isa<BlockedEncodingAttr, MmaEncodingTrait, SliceEncodingAttr>(
dstLayout)) {
(isa<BlockedEncodingAttr, MmaEncodingTrait, SliceEncodingAttr>(
dstLayout) ||
isAmpereLargeKWidth(dstLayout))) {
return lowerSharedToDistributed(op, adaptor, getTypeConverter(),
rewriter);
}
@@ -170,6 +181,37 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
SmallVector<Value> outVals = loadSharedToDistributed(
dstTy, srcTy, elemLlvmTy, smemObj, loc, rewriter, targetInfo);

// FIXME [Dot LL]
// Ampere case
// In this case, we need to pack the outputs into i32
if (isa<DotOperandEncodingAttr>(dstTy.getEncoding())) {
if (elemLlvmTy.isInteger(8)) {
auto concat = [&](Value a1, Value a2, Value a3, Value a4) {
return or_(or_(zext(i32_ty, a1), shl(zext(i32_ty, a2), i32_val(8))),
or_(shl(zext(i32_ty, a3), i32_val(16)),
shl(zext(i32_ty, a4), i32_val(24))));
};
SmallVector<Value> outVals32(outVals.size() / 4);
for (int i = 0; i < outVals32.size(); ++i) {
outVals32[i] = concat(outVals[4 * i], outVals[4 * i + 1],
outVals[4 * i + 2], outVals[4 * i + 3]);
}
outVals = outVals32;
} else {
assert(elemLlvmTy.isBF16() && "Unexpected element type");
auto concat = [&](Value a, Value b) {
return or_(zext(i32_ty, bitcast(a, i16_ty)),
shl(zext(i32_ty, bitcast(b, i16_ty)), i32_val(16)));
};

SmallVector<Value> outVals32(outVals.size() / 2);
for (int i = 0; i < outVals32.size(); ++i) {
outVals32[i] = concat(outVals[2 * i], outVals[2 * i + 1]);
}
outVals = outVals32;
}
}

Value result = packLLElements(loc, typeConverter, outVals, rewriter, dstTy);
rewriter.replaceOp(op, result);
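
For readers unfamiliar with the bit layout, the packing the new Ampere dot-operand path performs above (four i8 lanes or two bf16 lanes folded into one i32 register) can be sketched in plain Python. This is a minimal sketch; the helper names below are illustrative only and do not appear in the PR.

```python
# Minimal sketch of the i32 packing done above; names are illustrative, not PR code.

def pack_i8x4(a1, a2, a3, a4):
    # Mirrors: or_(or_(zext(a1), shl(zext(a2), 8)),
    #              or_(shl(zext(a3), 16), shl(zext(a4), 24)))
    # a1 ends up in the lowest byte of the resulting i32.
    return ((a1 & 0xFF)
            | ((a2 & 0xFF) << 8)
            | ((a3 & 0xFF) << 16)
            | ((a4 & 0xFF) << 24))

def pack_bf16x2(lo, hi):
    # Mirrors: or_(zext(bitcast(a, i16)), shl(zext(bitcast(b, i16)), 16)),
    # with lo/hi given here as the raw 16-bit patterns of the two bf16 values.
    return (lo & 0xFFFF) | ((hi & 0xFFFF) << 16)

assert pack_i8x4(0x11, 0x22, 0x33, 0x44) == 0x44332211
assert pack_bf16x2(0x3F80, 0x4000) == 0x40003F80
```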

7 changes: 7 additions & 0 deletions lib/Dialect/Triton/IR/Types.cpp
@@ -1,6 +1,7 @@
#include "triton/Dialect/Triton/IR/Types.h"

#include "mlir/IR/DialectImplementation.h" // required by `Types.cpp.inc`
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Support/LLVM.h"
#include "triton/Dialect/Triton/IR/Dialect.h"
#include "llvm/ADT/TypeSwitch.h" // required by `Types.cpp.inc`
@@ -157,6 +158,12 @@ Type getPointerTypeSameShape(Type type) {
}
}

Type getPointerTypeToElement(Type type) {
Type elementType = getElementTypeOrSelf(type);
PointerType ptrType = PointerType::get(elementType, 1);
return ptrType;
}

// upstream Triton only uses address space 1 for Pointer Type
Type getPointerType(Type type, int addressSpace) {
return PointerType::get(type, addressSpace);
95 changes: 3 additions & 92 deletions lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp
@@ -163,85 +163,6 @@ void LayoutRematerialization::cleanup() {
op->erase();
}

// Look ahead to at the transitive uses and see if there is a convert to mma
// operations.
bool hasConvertToMMATransisitiveUse(Operation *op, Attribute encoding) {
SmallVector<Value> queue = {op->getResult(0)};
SetVector<Operation *> forwardSlice;
llvm::SmallDenseSet<Value> seen;
while (!queue.empty()) {
Value currentValue = queue.back();
queue.pop_back();
getForwardSlice(currentValue, &forwardSlice);
for (Operation *op : forwardSlice) {
// HACK: Stop propagation if the ReduceOp is using mma layout but is
// producing tensor smaller than the layout we would like to propagate.
// This is to avoid stepping into the known bug.
if (isa<mlir::triton::ReduceOp>(op)) {
auto tensorType =
dyn_cast<RankedTensorType>(op->getOperand(0).getType());
if (tensorType &&
isa<NvidiaMmaEncodingAttr>(tensorType.getEncoding())) {
auto mmaInstrShape =
cast<NvidiaMmaEncodingAttr>(encoding).getInstrShape();
if (tensorType.getShape()[tensorType.getRank() - 2] <
mmaInstrShape[0] ||
tensorType.getShape()[tensorType.getRank() - 1] <
mmaInstrShape[1]) {
return false;
}
}
}

if (auto convertOp = dyn_cast<ConvertLayoutOp>(op)) {
Attribute dstEncoding = convertOp.getType().getEncoding();
if (auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(dstEncoding))
return (mmaLayout.getVersionMajor() > 1) ? true
: mmaLayout == encoding;
if (isa<triton::gpu::AMDMfmaEncodingAttr,
triton::gpu::AMDWmmaEncodingAttr>(dstEncoding))
return true;
if (isa<triton::gpu::DotOperandEncodingAttr>(dstEncoding)) {
if (auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(encoding)) {
return mmaLayout.getVersionMajor() > 1;
} else {
assert((mlir::isa<triton::gpu::AMDMfmaEncodingAttr,
triton::gpu::AMDWmmaEncodingAttr>(encoding)));
return true;
}
}
}
bool isMMAV3 =
isa<NvidiaMmaEncodingAttr>(encoding) &&
cast<NvidiaMmaEncodingAttr>(encoding).getVersionMajor() == 3;
if (isMMAV3 && (isa<LocalAllocOp>(op) || isa<LocalStoreOp>(op)))
return true;
auto yield = dyn_cast<scf::YieldOp>(op);
if (!yield)
continue;
if (auto ifOp = dyn_cast<scf::IfOp>(yield->getParentOp())) {
for (OpOperand &operand : yield->getOpOperands()) {
Operation *def = operand.get().getDefiningOp();
if (def &&
(forwardSlice.count(def) || operand.get() == currentValue) &&
(seen.insert(operand.get()).second == true))
queue.push_back(ifOp.getResult(operand.getOperandNumber()));
}
}
auto forOp = dyn_cast<scf::ForOp>(yield.getOperation()->getParentOp());
if (!forOp)
continue;
for (OpOperand &operand : yield->getOpOperands()) {
Operation *def = operand.get().getDefiningOp();
if (def && (forwardSlice.count(def) || operand.get() == currentValue) &&
(seen.insert(operand.get()).second == true))
queue.push_back(forOp.getRegionIterArg(operand.getOperandNumber()));
}
}
}
return false;
}

// Return true if the op is an op with a layout we don't want to change. We will
// propagate the layout starting from anchor ops.
bool isLayoutAnchor(Operation *op) {
@@ -262,18 +183,8 @@ bool isLayoutAnchor(Operation *op) {
}

void LayoutPropagation::initAnchorLayout() {
auto maybeAddAnchor = [&](Value v) {
auto addAnchor = [&](Value v) {
if (auto tensorType = dyn_cast<RankedTensorType>(v.getType())) {
// Workaround, don't popagate MMA layout unless there is a convert
// back to mma further down to avoid generating reduction with MMA
// layout that may have lower performance.
// This can be improved with more aggressive backward propagation.
if (isa<MmaEncodingTrait>(tensorType.getEncoding()) &&
v.getDefiningOp() &&
!hasConvertToMMATransisitiveUse(v.getDefiningOp(),
tensorType.getEncoding())) {
return;
}
layouts.insert({v, LayoutInfo(tensorType.getEncoding())});
}
};
@@ -282,13 +193,13 @@ void LayoutPropagation::initAnchorLayout() {
// you can pass a tensor with an encoding as an arg, instead of explicitly
// calling tt.load.
for (auto arg : funcOp.getArguments()) {
maybeAddAnchor(arg);
addAnchor(arg);
}

funcOp.walk([&](Operation *op) {
if (isLayoutAnchor(op)) {
for (auto result : op->getResults()) {
maybeAddAnchor(result);
addAnchor(result);
}
}
});
16 changes: 13 additions & 3 deletions python/test/regression/conftest.py
@@ -1,12 +1,22 @@
# content of conftest.py

import os
import pytest
import tempfile


def pytest_addoption(parser):
parser.addoption("--device", action="store", default='cuda')
parser.addoption("--device", action="store", default="cuda")


@pytest.fixture
def device(request):
return request.config.getoption("--device")


@pytest.fixture
def fresh_triton_cache():
with tempfile.TemporaryDirectory() as tmpdir:
try:
os.environ["TRITON_CACHE_DIR"] = tmpdir
yield tmpdir
finally:
os.environ.pop("TRITON_CACHE_DIR", None)
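
A hypothetical test (not part of this PR) showing how the two fixtures added here are meant to be consumed; `test_cache_is_isolated` is a made-up name.

```python
import os

def test_cache_is_isolated(fresh_triton_cache, device):
    # fresh_triton_cache points TRITON_CACHE_DIR at a throwaway directory
    # for the duration of this test, so cached kernels from other tests
    # cannot interfere with it.
    assert os.environ["TRITON_CACHE_DIR"] == fresh_triton_cache
    assert os.path.isdir(fresh_triton_cache)
    # device is whatever was passed on the command line via --device
    # (default "cuda"); the CPU CI job above invokes pytest with --device cpu.
    print(f"running on {device}")
```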