
Commit bc82b95

Merge commit '9e3886ffc134f60a6cd679c1a8645c752e722615'
2 parents f81817b + 9e3886f

File tree: 38 files changed, +984 -610 lines

.github/workflows/integration-tests-amd.yml

Lines changed: 24 additions & 8 deletions
@@ -18,8 +18,25 @@ jobs:
       runner: ${{ fromJson(inputs.matrix) }}
       include:
         - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+          runner: ["self-hosted", "gfx90a"]
+          # Cache save/restore is on the host machine at directory /home/runner/.triton, while in the docker
+          # container expect it at /github/home/.triton. So map here to make sure visible in docker.
+          options: >-
+            --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+            --volume /home/runner/.triton:/github/home/.triton
+        - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+          runner: ["amd-gfx942"]
+          # We add --env-file to pull in HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definition for GPU isolation.
+          options: >-
+            --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+            --env-file /etc/podinfo/gha-gpu-isolation-settings
+            --volume /home/runner/.triton:/github/home/.triton
         - image: rocm/7.0-preview:rocm7.0_preview_ubuntu22.04_llama2_70b_training_mlperf_mi35X_prealpha
           runner: ["amd-gfx950"]
+          options: >-
+            --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+            --env-file /etc/podinfo/gha-gpu-isolation-settings
+            --volume /home/runner/.triton:/github/home/.triton
     env:
       RUNNER_TYPE: ${{ matrix.runner[1] }}
       TRITON_BUILD_WITH_CCACHE: "true"
@@ -31,11 +48,7 @@ jobs:
       CCACHE_COMPRESS: "true"
     container:
       image: ${{ matrix.image }}
-      # Cache save/restore is on the host machine at directory /home/runner/.triton, while in the docker
-      # container expect it at /github/home/.triton. So map here to make sure visible in docker.
-      options: >-
-        --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
-        --volume /home/runner/.triton:/github/home/.triton
+      options: ${{ matrix.options }}
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -96,6 +109,8 @@ jobs:
         run: ccache --print-stats
       - name: Run lit tests
         run: make test-lit
+      - name: Run C++ unittests
+        run: make test-cpp
       - name: Run python tests on AMD
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
@@ -147,13 +162,13 @@ jobs:
           python3 -m pytest -s -n 8 ./test_cast_matmul.py
       - name: Run Proton tests
         run: |
+          unset HIP_VISIBLE_DEVICES
+          unset ROCR_VISIBLE_DEVICES
           if [ "${{ matrix.runner[0] }}" = "amd-gfx950" ]; then
             python3 -m pytest -s -n 8 third_party/proton/test -k "not test_instrument_exec"
           else
             make test-proton
           fi
-      - name: Run C++ unittests
-        run: make test-cpp
       - name: Inspect cache directories
         run: |
           mkdir -p ~/.triton
@@ -162,7 +177,8 @@ jobs:
           mkdir -p ~/.ccache
           du -h -d 1 ~/.ccache
       - name: Clean up caches
-        # Always cleanup the worker, even if builds or tests failed
+        # Always cleanup the worker, even if builds or tests failed given that these directories are
+        # mapped from the host and we write files as the root user in the docker.
        if: always()
        run: |
          rm -rf ~/.triton/cache

include/triton/Analysis/Utility.h

Lines changed: 0 additions & 5 deletions
@@ -252,11 +252,6 @@ bool cvtNeedsWarpShuffle(RankedTensorType srcTy, RankedTensorType dstTy);
 // warps, and possibly blocks.
 bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy);

-// Check if MFMA layout can be converted to the dot operand
-// layout using warp shuffle.
-bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,
-                                       RankedTensorType dstTy);
-
 // TODO: Move utility functions that belong to ConvertLayoutOp to class
 // ConvertLayoutOpHelper in the future
 bool shouldUseDistSmem(Attribute srcLayout, Attribute dstLayout);

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 1 addition & 1 deletion
@@ -1275,7 +1275,7 @@ def ReturnOp : TT_Op<"return", [Pure, HasParent<"FuncOp">, /*MemRefsNormalizable
   let arguments = (ins Variadic<AnyType>:$srcs);

   let builders = [OpBuilder<(ins), [{
-    build($_builder, $_state, std::nullopt);
+    build($_builder, $_state, mlir::ValueRange());
   }]>];

   let assemblyFormat = "attr-dict ($srcs^ `:` type($srcs))?";
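Note: this change, like the /*resultAttrs=*/{} change in lib/Dialect/Triton/IR/Ops.cpp below, replaces std::nullopt with an explicit empty range, presumably because newer LLVM/MLIR deprecates the std::nullopt_t constructors of ArrayRef and ValueRange. A minimal sketch of the new spelling; emitEmptyReturn is a hypothetical helper, not part of this patch:

#include "mlir/IR/Builders.h"
#include "triton/Dialect/Triton/IR/Dialect.h"

// Build a tt.return with no operands: pass an explicit empty ValueRange
// rather than std::nullopt.
static void emitEmptyReturn(mlir::OpBuilder &builder, mlir::Location loc) {
  builder.create<mlir::triton::ReturnOp>(loc, mlir::ValueRange());
}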

lib/Analysis/Utility.cpp

Lines changed: 1 addition & 24 deletions
@@ -719,24 +719,6 @@ bool supportMMA(Value value, int version) {
          (elemTy.isInteger(8) && version >= 2);
 }

-bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,
-                                       RankedTensorType dstTy) {
-  auto mfmaLayout = dyn_cast<AMDMfmaEncodingAttr>(srcTy.getEncoding());
-  auto dotOperandLayout = dyn_cast<DotOperandEncodingAttr>(dstTy.getEncoding());
-  if (!mfmaLayout || !dotOperandLayout)
-    return false;
-
-  // Currently supporting 32x32 and 16x16 FP8 MFMA -> dot operand case
-  return dotOperandLayout.getParent() == mfmaLayout &&
-         dotOperandLayout.getOpIdx() == 0 && mfmaLayout.getIsTransposed() &&
-         dotOperandLayout.getKWidth() == 8 &&
-         ((mfmaLayout.getMDim() == 16 && mfmaLayout.getNDim() == 16) ||
-          (mfmaLayout.getMDim() == 32 && mfmaLayout.getNDim() == 32)) &&
-         triton::type::isFloat8(srcTy.getElementType()) &&
-         triton::type::isFloat8(dstTy.getElementType()) &&
-         mfmaLayout.getWarpsPerCTA()[1] == 1;
-}
-
 // We get the smallest submap of srcTy^{-1} * dstTy that is not the identity
 // under the common dimensions. The idea here is that if we have a
 // transformation that's the identity on kBlock, we don't need to use
@@ -794,14 +776,9 @@ bool cvtNeedsWarpShuffle(RankedTensorType srcTy, RankedTensorType dstTy) {
 }

 bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy) {
-  // TODO(jlebar): Remove these special cases `isMfmaToDotShortcut` once
-  // they're fully subsumed by the linear-layout checks.
   return !cvtReordersRegisters(srcTy, dstTy) &&
          !cvtNeedsWarpShuffle(srcTy, dstTy) &&
-         !triton::gpu::intel::isDpasToDotShortcut(srcTy, dstTy) &&
-         // to be removed when generalized warp shuffle conversions
-         // are ready:
-         !matchMFMAAndDotOperandShuffleCase(srcTy, dstTy);
+         !triton::gpu::intel::isDpasToDotShortcut(srcTy, dstTy);
 }

 namespace {
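For readability, the post-patch cvtNeedsSharedMemory, reconstructed from the hunk above, reduces to:

bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy) {
  // The MFMA -> dot-operand shuffle special case is gone; only the Intel DPAS
  // shortcut remains outside the linear-layout based checks.
  return !cvtReordersRegisters(srcTy, dstTy) &&
         !cvtNeedsWarpShuffle(srcTy, dstTy) &&
         !triton::gpu::intel::isDpasToDotShortcut(srcTy, dstTy);
}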

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 1 addition & 1 deletion
@@ -1052,7 +1052,7 @@ void FuncOp::build(OpBuilder &builder, OperationState &state, StringRef name,
     return;
   assert(type.getNumInputs() == argAttrs.size());
   call_interface_impl::addArgAndResultAttrs(
-      builder, state, argAttrs, /*resultAttrs=*/std::nullopt,
+      builder, state, argAttrs, /*resultAttrs=*/{},
       getArgAttrsAttrName(state.name), getResAttrsAttrName(state.name));
 }

lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp

Lines changed: 4 additions & 3 deletions
@@ -105,9 +105,10 @@ struct ClipAsyncCopySizePerThread
   }
 };

-class CoalesceAsyncCopyPass
-    : public impl::TritonGPUCoalesceAsyncCopyBase<CoalesceAsyncCopyPass> {
-public:
+struct CoalesceAsyncCopyPass
+    : impl::TritonGPUCoalesceAsyncCopyBase<CoalesceAsyncCopyPass> {
+  using Base::Base;
+
   void runOnOperation() override {
     ModuleOp m = getOperation();
     MLIRContext *context = &getContext();
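The switch from class to struct with using Base::Base; follows the usual idiom for tablegen-generated MLIR passes. A minimal sketch with hypothetical names; MyExamplePass and MyExamplePassBase stand in for a generated pass and its base, they are not part of this patch:

// using Base::Base; inherits the constructors generated on the pass base
// class (default and options-taking), so no constructor boilerplate is needed.
struct MyExamplePass : impl::MyExamplePassBase<MyExamplePass> {
  using Base::Base;

  void runOnOperation() override {
    // ... pass logic operating on getOperation() ...
  }
};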

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/LoadMMASpecialization.cpp

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,3 @@
-#include "PartitionBuilder.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dominance.h"
@@ -9,6 +8,7 @@
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h"
 #include "triton/Dialect/TritonGPU/Transforms/Partition.h"
+#include "triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h"
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionBuilder.cpp

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-#include "PartitionBuilder.h"
+#include "triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h"
 #include "triton/Dialect/TritonGPU/Transforms/Partition.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
lib/Dialect/TritonGPU/Transforms/WarpSpecialization/RewritePartitionDependencies.cpp

Lines changed: 1 addition & 1 deletion
@@ -1,10 +1,10 @@
-#include "PartitionBuilder.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/Pass/Pass.h"
 #include "nvidia/include/Dialect/NVWS/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/Partition.h"
+#include "triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h"
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
