intel
diff --git a/‎.github/workflows/integration-tests-amd.yml‎
Lines changed: 24 additions & 8 deletions b/‎.github/workflows/integration-tests-amd.yml‎
Lines changed: 24 additions & 8 deletions
diff --git a/‎Makefile‎
Lines changed: 1 addition & 1 deletion b/‎Makefile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/triton/Analysis/Utility.h‎
Lines changed: 0 additions & 7 deletions b/‎include/triton/Analysis/Utility.h‎
Lines changed: 0 additions & 7 deletions
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h‎
Lines changed: 7 additions & 0 deletions b/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎include/triton/Dialect/Triton/IR/TritonOps.td‎
Lines changed: 1 addition & 1 deletion b/‎include/triton/Dialect/Triton/IR/TritonOps.td‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td‎
Lines changed: 0 additions & 2 deletions b/‎include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionBuilder.h‎ renamed to ‎include/triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h‎ b/‎lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionBuilder.h‎ renamed to ‎include/triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h‎
diff --git a/‎include/triton/Dialect/TritonInstrument/IR/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎include/triton/Dialect/TritonInstrument/IR/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎include/triton/Dialect/TritonInstrument/IR/Dialect.h‎
Lines changed: 2 additions & 0 deletions b/‎include/triton/Dialect/TritonInstrument/IR/Dialect.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎include/triton/Dialect/TritonInstrument/IR/TritonInstrumentAttrDefs.td‎
Lines changed: 15 additions & 0 deletions b/‎include/triton/Dialect/TritonInstrument/IR/TritonInstrumentAttrDefs.td‎
Lines changed: 15 additions & 0 deletions
@@ -18,8 +18,25 @@ jobs:
         runner: ${{ fromJson(inputs.matrix) }}
         include:
           - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+            runner: ["self-hosted", "gfx90a"]
+            # Cache save/restore happens on the host machine in /home/runner/.triton, while inside the Docker
+            # container it is expected at /github/home/.triton, so map the host directory into the container.
+            options: >-
+              --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+              --volume /home/runner/.triton:/github/home/.triton
+          - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+            runner: ["amd-gfx942"]
+            # Pass --env-file to pull in the HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definitions used for GPU isolation.
+            options: >-
+              --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+              --env-file /etc/podinfo/gha-gpu-isolation-settings
+              --volume /home/runner/.triton:/github/home/.triton
           - image: rocm/7.0-preview:rocm7.0_preview_ubuntu22.04_llama2_70b_training_mlperf_mi35X_prealpha
             runner: ["amd-gfx950"]
+            options: >-
+              --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+              --env-file /etc/podinfo/gha-gpu-isolation-settings
+              --volume /home/runner/.triton:/github/home/.triton
     env:
       RUNNER_TYPE: ${{ matrix.runner[1] }}
       TRITON_BUILD_WITH_CCACHE: "true"
@@ -31,11 +48,7 @@ jobs:
       CCACHE_COMPRESS: "true"
     container:
       image: ${{ matrix.image }}
-      # Cache save/restore is on the host machine at directory /home/runner/.triton, while in the docker
-      # container expect it at /github/home/.triton. So map here to make sure visible in docker.
-      options: >-
-        --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
-        --volume /home/runner/.triton:/github/home/.triton
+      options: ${{ matrix.options }}
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -96,6 +109,8 @@ jobs:
         run: ccache --print-stats
       - name: Run lit tests
         run: make test-lit
+      - name: Run C++ unittests
+        run: make test-cpp
       - name: Run python tests on AMD
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
@@ -147,13 +162,13 @@ jobs:
           python3 -m pytest -s -n 8 ./test_cast_matmul.py
       - name: Run Proton tests
         run: |
+          unset HIP_VISIBLE_DEVICES
+          unset ROCR_VISIBLE_DEVICES
           if [ "${{ matrix.runner[0] }}" = "amd-gfx950" ]; then
             python3 -m pytest -s -n 8 third_party/proton/test -k "not test_instrument_exec"
           else
             make test-proton
           fi
-      - name: Run C++ unittests
-        run: make test-cpp
       - name: Inspect cache directories
         run: |
           mkdir -p ~/.triton
@@ -162,7 +177,8 @@ jobs:
           mkdir -p ~/.ccache
           du -h -d 1 ~/.ccache
       - name: Clean up caches
-        # Always cleanup the worker, even if builds or tests failed
+        # Always clean up the worker, even if builds or tests failed, because these directories are
+        # mapped from the host and files are written as the root user inside the Docker container.
         if: always()
         run: |
           rm -rf ~/.triton/cache
 
@@ -6,7 +6,7 @@ PYTHON ?= python
 BUILD_DIR := $(shell cd python; $(PYTHON) -c 'from build_helpers import get_cmake_dir; print(get_cmake_dir())')
 TRITON_OPT := $(BUILD_DIR)/bin/triton-opt
 PYTEST := $(PYTHON) -m pytest
-LLVM_BUILD_PATH ?= $(realpath .llvm-project/build)
+LLVM_BUILD_PATH ?= "$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))/.llvm-project/build"
 NUM_PROCS ?= 8
 
 # Incremental builds
 
@@ -252,13 +252,6 @@ bool cvtNeedsWarpShuffle(RankedTensorType srcTy, RankedTensorType dstTy);
 // warps, and possibly blocks.
 bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy);
 
-bool atomicNeedsSharedMemory(Value result);
-
-// Check if MFMA layout can be converted to the dot operand
-// layout using warp shuffle.
-bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,
-                                       RankedTensorType dstTy);
-
 // TODO: Move utility functions that belong to ConvertLayoutOp to class
 // ConvertLayoutOpHelper in the future
 bool shouldUseDistSmem(Attribute srcLayout, Attribute dstLayout);
 
@@ -655,6 +655,13 @@ SmallVector<Value> inlineRegion(RewriterBase &rewriter, Region &region,
                           mlir::TypeID::get<TerminatorOp>(), loc);
 }
 
+void finalizeTensorAtomicResults(Operation *op, RankedTensorType tensorTy,
+                                 ConversionPatternRewriter &rewriter,
+                                 SmallVector<Value> &resultVals,
+                                 Type valueElemTy, TritonLLVMOpBuilder &b,
+                                 Value threadPred,
+                                 const TargetInfoBase &targetInfo,
+                                 const LLVMTypeConverter *typeConverter);
 } // namespace mlir
 
 #endif
@@ -1275,7 +1275,7 @@ def ReturnOp : TT_Op<"return", [Pure, HasParent<"FuncOp">, /*MemRefsNormalizable
   let arguments = (ins Variadic<AnyType>:$srcs);
 
   let builders = [OpBuilder<(ins), [{
-    build($_builder, $_state, std::nullopt);
+    build($_builder, $_state, mlir::ValueRange());
   }]>];
 
   let assemblyFormat = "attr-dict ($srcs^ `:` type($srcs))?";
 
@@ -214,8 +214,6 @@ def TTG_MemDescIndexOp : TTG_Op<"memdesc_index", [Pure, MemDescViewTrait]> {
      - the output shape is 4x16xf16, and
      - index = 1.
     Then the output descriptor is equivalent to input[1], where input is the logical tensor.
-
-    When the input is of rank 1 (i.e, shape=[k]), the output will have shape=[1].
   }];
 
   let arguments = (ins TTG_MemDescType:$src, I32:$index);
 
@@ -8,6 +8,8 @@ add_mlir_doc(TritonInstrumentDialect TritonInstrumentDialect dialects/ -gen-dial
 set(LLVM_TARGET_DEFINITIONS TritonInstrumentOps.td)
 mlir_tablegen(Ops.h.inc -gen-op-decls)
 mlir_tablegen(Ops.cpp.inc -gen-op-defs)
+mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
+mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
 add_mlir_doc(TritonInstrumentOps TritonInstrumentOps dialects/ -gen-op-doc)
 
 add_public_tablegen_target(TritonInstrumentTableGen)
@@ -5,6 +5,8 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 
+#include "triton/Dialect/TritonInstrument/IR/OpsEnums.h.inc"
+
 #define GET_OP_CLASSES
 #include "triton/Dialect/TritonInstrument/IR/Dialect.h.inc"
 #include "triton/Dialect/TritonInstrument/IR/Ops.h.inc"
 
@@ -0,0 +1,15 @@
+#ifndef TRITONINSTRUMENT_ATTR_DEFS
+#define TRITONINSTRUMENT_ATTR_DEFS
+
+include "mlir/IR/EnumAttr.td"
+
+def TT_MemTypeAttr : I32EnumAttr<  // 32-bit integer enum attribute describing a memory type
+    "MemType", "",  // enum name, followed by an empty summary string
+    [
+        I32EnumAttrCase<"SHARED", 0, "shared">,  // value 0, string form "shared"
+        I32EnumAttrCase<"TENSOR", 1, "tensor">,  // value 1, string form "tensor"
+    ]> {
+    let cppNamespace = "::mlir::triton::instrument";  // namespace for the generated C++ enum code
+}
+
+#endif // TRITONINSTRUMENT_ATTR_DEFS