21 changes: 14 additions & 7 deletions .github/workflows/integration-tests.yml
@@ -239,14 +239,14 @@ jobs:
cd python
LIT_TEST_DIR="build/$(ls build | grep -i cmake)/test"
if [ ! -d "${LIT_TEST_DIR}" ]; then
echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
echo "Could not find '${LIT_TEST_DIR}'" ; exit -1
fi
lit -v "${LIT_TEST_DIR}"
- name: Run python tests on CUDA
run: |
INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/build/$(ls python/build | grep -i lib)/triton/instrumentation"
if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
fi
cd python/test/unit
python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
@@ -268,14 +268,16 @@ jobs:
language/test_random.py language/test_block_pointer.py language/test_subprocess.py language/test_line_info.py \
runtime/test_autotuner.py::test_kwargs[False]\
../../tutorials/06-fused-attention.py::test_op --device cpu
- name: Run regression tests
run: |
cd python/test/regression
python3 -m pytest -s -n 8 .
- name: Run C++ unittests
run: |
cd python
cd "build/$(ls build | grep -i cmake)"
ctest -j32
- name: Run Proton tests
env:
LD_LIBRARY_PATH: "/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
run: |
cd third_party/proton
python3 -m pytest -s test
@@ -395,14 +397,14 @@ jobs:
cd python
LIT_TEST_DIR="build/$(ls build | grep -i cmake)/test"
if [ ! -d "${LIT_TEST_DIR}" ]; then
echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
echo "Could not find '${LIT_TEST_DIR}'" ; exit -1
fi
lit -v "${LIT_TEST_DIR}"
- name: Run python tests on HIP
run: |
INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
fi
pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
cd python/test/unit
@@ -416,10 +418,15 @@ jobs:

# Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s -n 8 language/test_line_info.py
- name: Run regression tests
run: |
# Reenable test_functional_regression.py once it's fixed
cd python/test/regression
python3 -m pytest -s -n 8 ./test_cast_matmul.py
- name: Run Proton tests
run: |
cd third_party/proton
python3 -m pytest test
python3 -m pytest -s test
- name: Run C++ unittests
run: |
cd python
24 changes: 15 additions & 9 deletions .github/workflows/integration-tests.yml.in
@@ -272,15 +272,15 @@ jobs:
cd python
LIT_TEST_DIR="build/$(ls build | grep -i cmake)/test"
if [ ! -d "${LIT_TEST_DIR}" ]; then
echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
echo "Could not find '${LIT_TEST_DIR}'" ; exit -1
fi
lit -v "${LIT_TEST_DIR}"

- name: Run python tests on CUDA
run: |
INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/build/$(ls python/build | grep -i lib)/triton/instrumentation"
if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
fi
cd python/test/unit
python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
@@ -304,16 +304,20 @@ jobs:
runtime/test_autotuner.py::test_kwargs[False]\
../../tutorials/06-fused-attention.py::test_op --device cpu

- name: Run regression tests
run: |
cd python/test/regression
python3 -m pytest -s -n 8 .

- &run-cpp-unittests-step
name: Run C++ unittests
run: |
cd python
cd "build/$(ls build | grep -i cmake)"
ctest -j32

- name: Run Proton tests
env:
LD_LIBRARY_PATH: "/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
- &run-proton-tests-step
name: Run Proton tests
run: |
cd third_party/proton
python3 -m pytest -s test
@@ -398,7 +402,7 @@ jobs:
run: |
INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
fi
pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
cd python/test/unit
@@ -413,11 +417,13 @@ jobs:
# Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s -n 8 language/test_line_info.py

- name: Run Proton tests
- name: Run regression tests
run: |
cd third_party/proton
python3 -m pytest test
# Reenable test_functional_regression.py once it's fixed
cd python/test/regression
python3 -m pytest -s -n 8 ./test_cast_matmul.py

- *run-proton-tests-step
- *run-cpp-unittests-step
- *save-build-artifacts-step
- *inspect-cache-directories-step
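
Note: the `.yml.in` template above reuses whole steps through standard YAML anchors (`&run-proton-tests-step`) and aliases (`*run-proton-tests-step`). The sketch below only illustrates generic YAML anchor/alias semantics with PyYAML; it is an assumption for illustration and makes no claim about how this repository expands the template into the final workflow file.

```python
# Sketch of YAML anchor/alias semantics, assuming PyYAML is installed.
import yaml

doc = """
cuda_steps:
  - &run-proton-tests-step
    name: Run Proton tests
    run: |
      cd third_party/proton
      python3 -m pytest -s test
hip_steps:
  - *run-proton-tests-step
"""

data = yaml.safe_load(doc)
# The alias resolves to the very same mapping that was anchored above,
# so both job lists end up with an identical step definition.
assert data["hip_steps"][0] == data["cuda_steps"][0]
```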
1 change: 0 additions & 1 deletion bin/triton-lsp.cpp
@@ -6,6 +6,5 @@ int main(int argc, char **argv) {
mlir::DialectRegistry registry;
registerTritonDialects(registry);

mlir::MLIRContext context(registry);
return mlir::failed(mlir::MlirLspServerMain(argc, argv, registry));
}
3 changes: 3 additions & 0 deletions docs/meetups/dev_conference_2024.md
@@ -0,0 +1,3 @@
The conference slides are available [here](https://drive.google.com/drive/folders/1osK9hwcX_lC1EjdZGB-v4w5oKx23UnU2?usp=drive_link)

The conference videos are available [here](https://www.youtube.com/playlist?list=PLc_vA1r0qoiTjlrINKUuFrI8Ptoopm8Vz).
2 changes: 2 additions & 0 deletions include/triton/Dialect/Triton/IR/Types.h
@@ -34,6 +34,8 @@ Type getI32SameShape(Type type);

Type getPointerTypeSameShape(Type type);

Type getPointerTypeToElement(Type type);

} // namespace triton

} // namespace mlir
46 changes: 44 additions & 2 deletions lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp
@@ -116,9 +116,20 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
RankedTensorType dstTy = op.getType();
Attribute srcLayout = srcTy.getEncoding();
Attribute dstLayout = dstTy.getEncoding();
// FIXME [Dot LL]
// Do for all DotOperandEncodingAttr once we have LLs for all of them
auto isAmpereLargeKWidth = [](Attribute layout) {
if (auto dot = dyn_cast<DotOperandEncodingAttr>(layout)) {
if (auto mma = dyn_cast<NvidiaMmaEncodingAttr>(dot.getParent())) {
return mma.isAmpere() && dot.getKWidth() == 8;
}
}
return false;
};
if (isa<SharedEncodingAttr>(srcLayout) &&
isa<BlockedEncodingAttr, MmaEncodingTrait, SliceEncodingAttr>(
dstLayout)) {
(isa<BlockedEncodingAttr, MmaEncodingTrait, SliceEncodingAttr>(
dstLayout) ||
isAmpereLargeKWidth(dstLayout))) {
return lowerSharedToDistributed(op, adaptor, getTypeConverter(),
rewriter);
}
@@ -170,6 +181,37 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
SmallVector<Value> outVals = loadSharedToDistributed(
dstTy, srcTy, elemLlvmTy, smemObj, loc, rewriter, targetInfo);

// FIXME [Dot LL]
// Ampere case
// In this case, we need to pack the outputs into i32
if (isa<DotOperandEncodingAttr>(dstTy.getEncoding())) {
if (elemLlvmTy.isInteger(8)) {
auto concat = [&](Value a1, Value a2, Value a3, Value a4) {
return or_(or_(zext(i32_ty, a1), shl(zext(i32_ty, a2), i32_val(8))),
or_(shl(zext(i32_ty, a3), i32_val(16)),
shl(zext(i32_ty, a4), i32_val(24))));
};
SmallVector<Value> outVals32(outVals.size() / 4);
for (int i = 0; i < outVals32.size(); ++i) {
outVals32[i] = concat(outVals[4 * i], outVals[4 * i + 1],
outVals[4 * i + 2], outVals[4 * i + 3]);
}
outVals = outVals32;
} else {
assert(elemLlvmTy.isBF16() && "Unexpected element type");
auto concat = [&](Value a, Value b) {
return or_(zext(i32_ty, bitcast(a, i16_ty)),
shl(zext(i32_ty, bitcast(b, i16_ty)), i32_val(16)));
};

SmallVector<Value> outVals32(outVals.size() / 2);
for (int i = 0; i < outVals32.size(); ++i) {
outVals32[i] = concat(outVals[2 * i], outVals[2 * i + 1]);
}
outVals = outVals32;
}
}

Value result = packLLElements(loc, typeConverter, outVals, rewriter, dstTy);
rewriter.replaceOp(op, result);
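
For readers unfamiliar with the bit layout, the packing the new Ampere dot-operand path performs above (four i8 lanes or two bf16 lanes folded into one i32 register) can be sketched in plain Python. This is a minimal sketch; the helper names below are illustrative only and do not appear in the PR.

```python
# Minimal sketch of the i32 packing done above; names are illustrative, not PR code.

def pack_i8x4(a1, a2, a3, a4):
    # Mirrors: or_(or_(zext(a1), shl(zext(a2), 8)),
    #              or_(shl(zext(a3), 16), shl(zext(a4), 24)))
    # a1 ends up in the lowest byte of the resulting i32.
    return ((a1 & 0xFF)
            | ((a2 & 0xFF) << 8)
            | ((a3 & 0xFF) << 16)
            | ((a4 & 0xFF) << 24))

def pack_bf16x2(lo, hi):
    # Mirrors: or_(zext(bitcast(a, i16)), shl(zext(bitcast(b, i16)), 16)),
    # with lo/hi given here as the raw 16-bit patterns of the two bf16 values.
    return (lo & 0xFFFF) | ((hi & 0xFFFF) << 16)

assert pack_i8x4(0x11, 0x22, 0x33, 0x44) == 0x44332211
assert pack_bf16x2(0x3F80, 0x4000) == 0x40003F80
```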

7 changes: 7 additions & 0 deletions lib/Dialect/Triton/IR/Types.cpp
@@ -1,6 +1,7 @@
#include "triton/Dialect/Triton/IR/Types.h"

#include "mlir/IR/DialectImplementation.h" // required by `Types.cpp.inc`
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Support/LLVM.h"
#include "triton/Dialect/Triton/IR/Dialect.h"
#include "llvm/ADT/TypeSwitch.h" // required by `Types.cpp.inc`
@@ -157,6 +158,12 @@ Type getPointerTypeSameShape(Type type) {
}
}

Type getPointerTypeToElement(Type type) {
Type elementType = getElementTypeOrSelf(type);
PointerType ptrType = PointerType::get(elementType, 1);
return ptrType;
}

// upstream Triton only uses address space 1 for Pointer Type
Type getPointerType(Type type, int addressSpace) {
return PointerType::get(type, addressSpace);
95 changes: 3 additions & 92 deletions lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp
@@ -163,85 +163,6 @@ void LayoutRematerialization::cleanup() {
op->erase();
}

// Look ahead to at the transitive uses and see if there is a convert to mma
// operations.
bool hasConvertToMMATransisitiveUse(Operation *op, Attribute encoding) {
SmallVector<Value> queue = {op->getResult(0)};
SetVector<Operation *> forwardSlice;
llvm::SmallDenseSet<Value> seen;
while (!queue.empty()) {
Value currentValue = queue.back();
queue.pop_back();
getForwardSlice(currentValue, &forwardSlice);
for (Operation *op : forwardSlice) {
// HACK: Stop propagation if the ReduceOp is using mma layout but is
// producing tensor smaller than the layout we would like to propagate.
// This is to avoid stepping into the known bug.
if (isa<mlir::triton::ReduceOp>(op)) {
auto tensorType =
dyn_cast<RankedTensorType>(op->getOperand(0).getType());
if (tensorType &&
isa<NvidiaMmaEncodingAttr>(tensorType.getEncoding())) {
auto mmaInstrShape =
cast<NvidiaMmaEncodingAttr>(encoding).getInstrShape();
if (tensorType.getShape()[tensorType.getRank() - 2] <
mmaInstrShape[0] ||
tensorType.getShape()[tensorType.getRank() - 1] <
mmaInstrShape[1]) {
return false;
}
}
}

if (auto convertOp = dyn_cast<ConvertLayoutOp>(op)) {
Attribute dstEncoding = convertOp.getType().getEncoding();
if (auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(dstEncoding))
return (mmaLayout.getVersionMajor() > 1) ? true
: mmaLayout == encoding;
if (isa<triton::gpu::AMDMfmaEncodingAttr,
triton::gpu::AMDWmmaEncodingAttr>(dstEncoding))
return true;
if (isa<triton::gpu::DotOperandEncodingAttr>(dstEncoding)) {
if (auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(encoding)) {
return mmaLayout.getVersionMajor() > 1;
} else {
assert((mlir::isa<triton::gpu::AMDMfmaEncodingAttr,
triton::gpu::AMDWmmaEncodingAttr>(encoding)));
return true;
}
}
}
bool isMMAV3 =
isa<NvidiaMmaEncodingAttr>(encoding) &&
cast<NvidiaMmaEncodingAttr>(encoding).getVersionMajor() == 3;
if (isMMAV3 && (isa<LocalAllocOp>(op) || isa<LocalStoreOp>(op)))
return true;
auto yield = dyn_cast<scf::YieldOp>(op);
if (!yield)
continue;
if (auto ifOp = dyn_cast<scf::IfOp>(yield->getParentOp())) {
for (OpOperand &operand : yield->getOpOperands()) {
Operation *def = operand.get().getDefiningOp();
if (def &&
(forwardSlice.count(def) || operand.get() == currentValue) &&
(seen.insert(operand.get()).second == true))
queue.push_back(ifOp.getResult(operand.getOperandNumber()));
}
}
auto forOp = dyn_cast<scf::ForOp>(yield.getOperation()->getParentOp());
if (!forOp)
continue;
for (OpOperand &operand : yield->getOpOperands()) {
Operation *def = operand.get().getDefiningOp();
if (def && (forwardSlice.count(def) || operand.get() == currentValue) &&
(seen.insert(operand.get()).second == true))
queue.push_back(forOp.getRegionIterArg(operand.getOperandNumber()));
}
}
}
return false;
}

// Return true if the op is an op with a layout we don't want to change. We will
// propagate the layout starting from anchor ops.
bool isLayoutAnchor(Operation *op) {
@@ -262,18 +183,8 @@ bool isLayoutAnchor(Operation *op) {
}

void LayoutPropagation::initAnchorLayout() {
auto maybeAddAnchor = [&](Value v) {
auto addAnchor = [&](Value v) {
if (auto tensorType = dyn_cast<RankedTensorType>(v.getType())) {
// Workaround, don't popagate MMA layout unless there is a convert
// back to mma further down to avoid generating reduction with MMA
// layout that may have lower performance.
// This can be improved with more aggressive backward propagation.
if (isa<MmaEncodingTrait>(tensorType.getEncoding()) &&
v.getDefiningOp() &&
!hasConvertToMMATransisitiveUse(v.getDefiningOp(),
tensorType.getEncoding())) {
return;
}
layouts.insert({v, LayoutInfo(tensorType.getEncoding())});
}
};
@@ -282,13 +193,13 @@ void LayoutPropagation::initAnchorLayout() {
// you can pass a tensor with an encoding as an arg, instead of explicitly
// calling tt.load.
for (auto arg : funcOp.getArguments()) {
maybeAddAnchor(arg);
addAnchor(arg);
}

funcOp.walk([&](Operation *op) {
if (isLayoutAnchor(op)) {
for (auto result : op->getResults()) {
maybeAddAnchor(result);
addAnchor(result);
}
}
});
16 changes: 13 additions & 3 deletions python/test/regression/conftest.py
@@ -1,12 +1,22 @@
# content of conftest.py

import os
import pytest
import tempfile


def pytest_addoption(parser):
parser.addoption("--device", action="store", default='cuda')
parser.addoption("--device", action="store", default="cuda")


@pytest.fixture
def device(request):
return request.config.getoption("--device")


@pytest.fixture
def fresh_triton_cache():
with tempfile.TemporaryDirectory() as tmpdir:
try:
os.environ["TRITON_CACHE_DIR"] = tmpdir
yield tmpdir
finally:
os.environ.pop("TRITON_CACHE_DIR", None)
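
A hypothetical test (not part of this PR) showing how the two fixtures added here are meant to be consumed; `test_cache_is_isolated` is a made-up name.

```python
import os

def test_cache_is_isolated(fresh_triton_cache, device):
    # fresh_triton_cache points TRITON_CACHE_DIR at a throwaway directory
    # for the duration of this test, so cached kernels from other tests
    # cannot interfere with it.
    assert os.environ["TRITON_CACHE_DIR"] == fresh_triton_cache
    assert os.path.isdir(fresh_triton_cache)
    # device is whatever was passed on the command line via --device
    # (default "cuda"); the CPU CI job above invokes pytest with --device cpu.
    print(f"running on {device}")
```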