intel
diff --git a/‎.github/pins/pytorch-upstream.txt‎
Lines changed: 1 addition & 1 deletion b/‎.github/pins/pytorch-upstream.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/auto-update-translator-cid.yml‎
Lines changed: 0 additions & 1 deletion b/‎.github/workflows/auto-update-translator-cid.yml‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎.github/workflows/integration-tests.yml‎
Lines changed: 14 additions & 7 deletions b/‎.github/workflows/integration-tests.yml‎
Lines changed: 14 additions & 7 deletions
diff --git a/‎.github/workflows/integration-tests.yml.in‎
Lines changed: 15 additions & 9 deletions b/‎.github/workflows/integration-tests.yml.in‎
Lines changed: 15 additions & 9 deletions
diff --git a/‎benchmarks/triton_kernels_benchmark/benchmark_driver.py‎
Lines changed: 1 addition & 0 deletions b/‎benchmarks/triton_kernels_benchmark/benchmark_driver.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎bin/triton-lsp.cpp‎
Lines changed: 0 additions & 1 deletion b/‎bin/triton-lsp.cpp‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎docs/meetups/dev_conference_2024.md‎
Lines changed: 3 additions & 0 deletions b/‎docs/meetups/dev_conference_2024.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎include/triton/Dialect/Triton/IR/Types.h‎
Lines changed: 2 additions & 0 deletions b/‎include/triton/Dialect/Triton/IR/Types.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp‎
Lines changed: 44 additions & 2 deletions b/‎lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp‎
Lines changed: 44 additions & 2 deletions
diff --git a/‎lib/Dialect/Triton/IR/Types.cpp‎
Lines changed: 7 additions & 0 deletions b/‎lib/Dialect/Triton/IR/Types.cpp‎
Lines changed: 7 additions & 0 deletions
@@ -1 +1 @@
-487873f7cafeb0fd390eaefe40496b804bceabbd
+0efa590d435d2b4aefcbad9014dd5fa75dcf8405
@@ -86,7 +86,6 @@ jobs:
       - name: Search the latest valid Translator cid
         if: ${{ env.TARGET_PRID == null }}
         run: |
-          env
           ./scripts/check-update-translator-cid.sh $CID_LATEST $CID_CURRENT
           if git status --porcelain ./lib/Target/SPIRV/spirv-llvm-translator.conf | grep '^ M'; then
             echo "MODIFIED=true" >> $GITHUB_ENV
 
@@ -239,14 +239,14 @@ jobs:
           cd python
           LIT_TEST_DIR="build/$(ls build | grep -i cmake)/test"
           if [ ! -d "${LIT_TEST_DIR}" ]; then
-            echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
+            echo "Could not find '${LIT_TEST_DIR}'" ; exit -1
           fi
           lit -v "${LIT_TEST_DIR}"
       - name: Run python tests on CUDA
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/build/$(ls python/build | grep -i lib)/triton/instrumentation"
           if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
-            echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
+            echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
           fi
           cd python/test/unit
           python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
@@ -268,14 +268,16 @@ jobs:
            language/test_random.py language/test_block_pointer.py language/test_subprocess.py language/test_line_info.py \
            runtime/test_autotuner.py::test_kwargs[False]\
            ../../tutorials/06-fused-attention.py::test_op --device cpu
+      - name: Run regression tests
+        run: |
+          cd python/test/regression
+          python3 -m pytest -s -n 8 .
       - name: Run C++ unittests
         run: |
           cd python
           cd "build/$(ls build | grep -i cmake)"
           ctest -j32
       - name: Run Proton tests
-        env:
-          LD_LIBRARY_PATH: "/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
         run: |
           cd third_party/proton
           python3 -m pytest -s test
@@ -395,14 +397,14 @@ jobs:
           cd python
           LIT_TEST_DIR="build/$(ls build | grep -i cmake)/test"
           if [ ! -d "${LIT_TEST_DIR}" ]; then
-            echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
+            echo "Could not find '${LIT_TEST_DIR}'" ; exit -1
           fi
           lit -v "${LIT_TEST_DIR}"
       - name: Run python tests on HIP
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
           if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
-            echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
+            echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
           fi
           pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
           cd python/test/unit
@@ -416,10 +418,15 @@ jobs:
 
           # Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
           TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s -n 8 language/test_line_info.py
+      - name: Run regression tests
+        run: |
+          # Reenable test_functional_regression.py once it's fixed
+          cd python/test/regression
+          python3 -m pytest -s -n 8 ./test_cast_matmul.py
       - name: Run Proton tests
         run: |
           cd third_party/proton
-          python3 -m pytest test
+          python3 -m pytest -s test
       - name: Run C++ unittests
         run: |
           cd python
 
@@ -272,15 +272,15 @@ jobs:
           cd python
           LIT_TEST_DIR="build/$(ls build | grep -i cmake)/test"
           if [ ! -d "${LIT_TEST_DIR}" ]; then
-            echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
+            echo "Could not find '${LIT_TEST_DIR}'" ; exit -1
           fi
           lit -v "${LIT_TEST_DIR}"
 
       - name: Run python tests on CUDA
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/build/$(ls python/build | grep -i lib)/triton/instrumentation"
           if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
-            echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
+            echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
           fi
           cd python/test/unit
           python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
@@ -304,16 +304,20 @@ jobs:
            runtime/test_autotuner.py::test_kwargs[False]\
            ../../tutorials/06-fused-attention.py::test_op --device cpu
 
+      - name: Run regression tests
+        run: |
+          cd python/test/regression
+          python3 -m pytest -s -n 8 .
+
       - &run-cpp-unittests-step
         name: Run C++ unittests
         run: |
           cd python
           cd "build/$(ls build | grep -i cmake)"
           ctest -j32
 
-      - name: Run Proton tests
-        env:
-          LD_LIBRARY_PATH: "/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
+      - &run-proton-tests-step
+        name: Run Proton tests
         run: |
           cd third_party/proton
           python3 -m pytest -s test
@@ -398,7 +402,7 @@ jobs:
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
           if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
-            echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
+            echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
           fi
           pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
           cd python/test/unit
@@ -413,11 +417,13 @@ jobs:
           # Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
           TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s -n 8 language/test_line_info.py
 
-      - name: Run Proton tests
+      - name: Run regression tests
         run: |
-          cd third_party/proton
-          python3 -m pytest test
+          # Reenable test_functional_regression.py once it's fixed
+          cd python/test/regression
+          python3 -m pytest -s -n 8 ./test_cast_matmul.py
 
+      - *run-proton-tests-step
       - *run-cpp-unittests-step
       - *save-build-artifacts-step
       - *inspect-cache-directories-step
 
@@ -405,6 +405,7 @@ def serialize_kernel_metadata(arg, args_dict):
     args_dict["shared_memory"] = arg.shared
     args_dict["kernel_name"] = arg.name
     args_dict["spv_name"] = f"{arg.name}.spv"
+    args_dict["build_flags"] = arg.build_flags
 
 
 def serialize_args(args, constants, signature):
 
@@ -6,6 +6,5 @@ int main(int argc, char **argv) {
   mlir::DialectRegistry registry;
   registerTritonDialects(registry);
 
-  mlir::MLIRContext context(registry);
   return mlir::failed(mlir::MlirLspServerMain(argc, argv, registry));
 }
@@ -0,0 +1,3 @@
+The conference slides are available [here](https://drive.google.com/drive/folders/1osK9hwcX_lC1EjdZGB-v4w5oKx23UnU2?usp=drive_link)
+
+The conference videos are available [here](https://www.youtube.com/playlist?list=PLc_vA1r0qoiTjlrINKUuFrI8Ptoopm8Vz).
@@ -34,6 +34,8 @@ Type getI32SameShape(Type type);
 
 Type getPointerTypeSameShape(Type type);
 
+Type getPointerTypeToElement(Type type);
+
 } // namespace triton
 
 } // namespace mlir
 
@@ -116,9 +116,20 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
     RankedTensorType dstTy = op.getType();
     Attribute srcLayout = srcTy.getEncoding();
     Attribute dstLayout = dstTy.getEncoding();
+    // FIXME [Dot LL]
+    // Do for all DotOperandEncodingAttr once we have LLs for all of them
+    auto isAmpereLargeKWidth = [](Attribute layout) {
+      if (auto dot = dyn_cast<DotOperandEncodingAttr>(layout)) {
+        if (auto mma = dyn_cast<NvidiaMmaEncodingAttr>(dot.getParent())) {
+          return mma.isAmpere() && dot.getKWidth() == 8;
+        }
+      }
+      return false;
+    };
     if (isa<SharedEncodingAttr>(srcLayout) &&
-        isa<BlockedEncodingAttr, MmaEncodingTrait, SliceEncodingAttr>(
-            dstLayout)) {
+        (isa<BlockedEncodingAttr, MmaEncodingTrait, SliceEncodingAttr>(
+             dstLayout) ||
+         isAmpereLargeKWidth(dstLayout))) {
       return lowerSharedToDistributed(op, adaptor, getTypeConverter(),
                                       rewriter);
     }
@@ -170,6 +181,37 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
     SmallVector<Value> outVals = loadSharedToDistributed(
         dstTy, srcTy, elemLlvmTy, smemObj, loc, rewriter, targetInfo);
 
+    // FIXME [Dot LL]
+    // Ampere case
+    // In this case, we need to pack the outputs into i32
+    if (isa<DotOperandEncodingAttr>(dstTy.getEncoding())) {
+      if (elemLlvmTy.isInteger(8)) {
+        auto concat = [&](Value a1, Value a2, Value a3, Value a4) {
+          return or_(or_(zext(i32_ty, a1), shl(zext(i32_ty, a2), i32_val(8))),
+                     or_(shl(zext(i32_ty, a3), i32_val(16)),
+                         shl(zext(i32_ty, a4), i32_val(24))));
+        };
+        SmallVector<Value> outVals32(outVals.size() / 4);
+        for (int i = 0; i < outVals32.size(); ++i) {
+          outVals32[i] = concat(outVals[4 * i], outVals[4 * i + 1],
+                                outVals[4 * i + 2], outVals[4 * i + 3]);
+        }
+        outVals = outVals32;
+      } else {
+        assert(elemLlvmTy.isBF16() && "Unexpected element type");
+        auto concat = [&](Value a, Value b) {
+          return or_(zext(i32_ty, bitcast(a, i16_ty)),
+                     shl(zext(i32_ty, bitcast(b, i16_ty)), i32_val(16)));
+        };
+
+        SmallVector<Value> outVals32(outVals.size() / 2);
+        for (int i = 0; i < outVals32.size(); ++i) {
+          outVals32[i] = concat(outVals[2 * i], outVals[2 * i + 1]);
+        }
+        outVals = outVals32;
+      }
+    }
+
     Value result = packLLElements(loc, typeConverter, outVals, rewriter, dstTy);
     rewriter.replaceOp(op, result);
 
 
@@ -1,6 +1,7 @@
 #include "triton/Dialect/Triton/IR/Types.h"
 
 #include "mlir/IR/DialectImplementation.h" // required by `Types.cpp.inc`
+#include "mlir/IR/TypeUtilities.h"
 #include "mlir/Support/LLVM.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "llvm/ADT/TypeSwitch.h" // required by `Types.cpp.inc`
@@ -157,6 +158,12 @@ Type getPointerTypeSameShape(Type type) {
   }
 }
 
+Type getPointerTypeToElement(Type type) {
+  // Build a pointer (address space 1) to whatever getElementTypeOrSelf
+  // yields for `type`.
+  return PointerType::get(getElementTypeOrSelf(type), 1);
+}
+
 // upstream Triton only uses address space 1 for Pointer Type
 Type getPointerType(Type type, int addressSpace) {
   return PointerType::get(type, addressSpace);
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-487873f7cafeb0fd390eaefe40496b804bceabbd`
	`1`	`+0efa590d435d2b4aefcbad9014dd5fa75dcf8405`
Original file line number	Diff line number	Diff line change
`@@ -6,6 +6,5 @@ int main(int argc, char **argv) {`
`6`	`6`	`mlir::DialectRegistry registry;`
`7`	`7`	`registerTritonDialects(registry);`
`8`	`8`
`9`		`- mlir::MLIRContext context(registry);`
`10`	`9`	`return mlir::failed(mlir::MlirLspServerMain(argc, argv, registry));`
`11`	`10`	`}`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+The conference slides are available [here](https://drive.google.com/drive/folders/1osK9hwcX_lC1EjdZGB-v4w5oKx23UnU2?usp=drive_link)`
	`2`	`+`
	`3`	`+The conference videos are available [here](https://www.youtube.com/playlist?list=PLc_vA1r0qoiTjlrINKUuFrI8Ptoopm8Vz).`