
Commit bc82b95

Merge commit '9e3886ffc134f60a6cd679c1a8645c752e722615'
2 parents f81817b + 9e3886f

File tree: 38 files changed, +984 -610 lines

.github/workflows/integration-tests-amd.yml

Lines changed: 24 additions & 8 deletions
@@ -18,8 +18,25 @@ jobs:
       runner: ${{ fromJson(inputs.matrix) }}
       include:
         - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+          runner: ["self-hosted", "gfx90a"]
+          # Cache save/restore is on the host machine at directory /home/runner/.triton, while in the docker
+          # container expect it at /github/home/.triton. So map here to make sure visible in docker.
+          options: >-
+            --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+            --volume /home/runner/.triton:/github/home/.triton
+        - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+          runner: ["amd-gfx942"]
+          # We add --env-file to pull in HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definition for GPU isolation.
+          options: >-
+            --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+            --env-file /etc/podinfo/gha-gpu-isolation-settings
+            --volume /home/runner/.triton:/github/home/.triton
         - image: rocm/7.0-preview:rocm7.0_preview_ubuntu22.04_llama2_70b_training_mlperf_mi35X_prealpha
           runner: ["amd-gfx950"]
+          options: >-
+            --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+            --env-file /etc/podinfo/gha-gpu-isolation-settings
+            --volume /home/runner/.triton:/github/home/.triton
     env:
       RUNNER_TYPE: ${{ matrix.runner[1] }}
       TRITON_BUILD_WITH_CCACHE: "true"
@@ -31,11 +48,7 @@ jobs:
       CCACHE_COMPRESS: "true"
     container:
       image: ${{ matrix.image }}
-      # Cache save/restore is on the host machine at directory /home/runner/.triton, while in the docker
-      # container expect it at /github/home/.triton. So map here to make sure visible in docker.
-      options: >-
-        --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
-        --volume /home/runner/.triton:/github/home/.triton
+      options: ${{ matrix.options }}
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -96,6 +109,8 @@ jobs:
         run: ccache --print-stats
       - name: Run lit tests
         run: make test-lit
+      - name: Run C++ unittests
+        run: make test-cpp
       - name: Run python tests on AMD
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
@@ -147,13 +162,13 @@ jobs:
           python3 -m pytest -s -n 8 ./test_cast_matmul.py
       - name: Run Proton tests
         run: |
+          unset HIP_VISIBLE_DEVICES
+          unset ROCR_VISIBLE_DEVICES
           if [ "${{ matrix.runner[0] }}" = "amd-gfx950" ]; then
             python3 -m pytest -s -n 8 third_party/proton/test -k "not test_instrument_exec"
           else
             make test-proton
           fi
-      - name: Run C++ unittests
-        run: make test-cpp
       - name: Inspect cache directories
         run: |
           mkdir -p ~/.triton
@@ -162,7 +177,8 @@ jobs:
           mkdir -p ~/.ccache
           du -h -d 1 ~/.ccache
       - name: Clean up caches
-        # Always cleanup the worker, even if builds or tests failed
+        # Always cleanup the worker, even if builds or tests failed given that these directories are
+        # mapped from the host and we write files as the root user in the docker.
        if: always()
        run: |
          rm -rf ~/.triton/cache

include/triton/Analysis/Utility.h

Lines changed: 0 additions & 5 deletions
@@ -252,11 +252,6 @@ bool cvtNeedsWarpShuffle(RankedTensorType srcTy, RankedTensorType dstTy);
 // warps, and possibly blocks.
 bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy);

-// Check if MFMA layout can be converted to the dot operand
-// layout using warp shuffle.
-bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,
-                                       RankedTensorType dstTy);
-
 // TODO: Move utility functions that belong to ConvertLayoutOp to class
 // ConvertLayoutOpHelper in the future
 bool shouldUseDistSmem(Attribute srcLayout, Attribute dstLayout);

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 1 addition & 1 deletion
@@ -1275,7 +1275,7 @@ def ReturnOp : TT_Op<"return", [Pure, HasParent<"FuncOp">, /*MemRefsNormalizable
   let arguments = (ins Variadic<AnyType>:$srcs);

   let builders = [OpBuilder<(ins), [{
-    build($_builder, $_state, std::nullopt);
+    build($_builder, $_state, mlir::ValueRange());
   }]>];

   let assemblyFormat = "attr-dict ($srcs^ `:` type($srcs))?";
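Note: this change, like the /*resultAttrs=*/{} change in lib/Dialect/Triton/IR/Ops.cpp below, replaces std::nullopt with an explicit empty range, presumably because newer LLVM/MLIR deprecates the std::nullopt_t constructors of ArrayRef and ValueRange. A minimal sketch of the new spelling; emitEmptyReturn is a hypothetical helper, not part of this patch:

#include "mlir/IR/Builders.h"
#include "triton/Dialect/Triton/IR/Dialect.h"

// Build a tt.return with no operands: pass an explicit empty ValueRange
// rather than std::nullopt.
static void emitEmptyReturn(mlir::OpBuilder &builder, mlir::Location loc) {
  builder.create<mlir::triton::ReturnOp>(loc, mlir::ValueRange());
}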

lib/Analysis/Utility.cpp

Lines changed: 1 addition & 24 deletions
@@ -719,24 +719,6 @@ bool supportMMA(Value value, int version) {
          (elemTy.isInteger(8) && version >= 2);
 }

-bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,
-                                       RankedTensorType dstTy) {
-  auto mfmaLayout = dyn_cast<AMDMfmaEncodingAttr>(srcTy.getEncoding());
-  auto dotOperandLayout = dyn_cast<DotOperandEncodingAttr>(dstTy.getEncoding());
-  if (!mfmaLayout || !dotOperandLayout)
-    return false;
-
-  // Currently supporting 32x32 and 16x16 FP8 MFMA -> dot operand case
-  return dotOperandLayout.getParent() == mfmaLayout &&
-         dotOperandLayout.getOpIdx() == 0 && mfmaLayout.getIsTransposed() &&
-         dotOperandLayout.getKWidth() == 8 &&
-         ((mfmaLayout.getMDim() == 16 && mfmaLayout.getNDim() == 16) ||
-          (mfmaLayout.getMDim() == 32 && mfmaLayout.getNDim() == 32)) &&
-         triton::type::isFloat8(srcTy.getElementType()) &&
-         triton::type::isFloat8(dstTy.getElementType()) &&
-         mfmaLayout.getWarpsPerCTA()[1] == 1;
-}
-
 // We get the smallest submap of srcTy^{-1} * dstTy that is not the identity
 // under the common dimensions. The idea here is that if we have a
 // transformation that's the identity on kBlock, we don't need to use
@@ -794,14 +776,9 @@ bool cvtNeedsWarpShuffle(RankedTensorType srcTy, RankedTensorType dstTy) {
 }

 bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy) {
-  // TODO(jlebar): Remove these special cases `isMfmaToDotShortcut` once
-  // they're fully subsumed by the linear-layout checks.
   return !cvtReordersRegisters(srcTy, dstTy) &&
          !cvtNeedsWarpShuffle(srcTy, dstTy) &&
-         !triton::gpu::intel::isDpasToDotShortcut(srcTy, dstTy) &&
-         // to be removed when generalized warp shuffle conversions
-         // are ready:
-         !matchMFMAAndDotOperandShuffleCase(srcTy, dstTy);
+         !triton::gpu::intel::isDpasToDotShortcut(srcTy, dstTy);
 }

 namespace {
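For readability, the post-patch cvtNeedsSharedMemory, reconstructed from the hunk above, reduces to:

bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy) {
  // The MFMA -> dot-operand shuffle special case is gone; only the Intel DPAS
  // shortcut remains outside the linear-layout based checks.
  return !cvtReordersRegisters(srcTy, dstTy) &&
         !cvtNeedsWarpShuffle(srcTy, dstTy) &&
         !triton::gpu::intel::isDpasToDotShortcut(srcTy, dstTy);
}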

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 1 addition & 1 deletion
@@ -1052,7 +1052,7 @@ void FuncOp::build(OpBuilder &builder, OperationState &state, StringRef name,
     return;
   assert(type.getNumInputs() == argAttrs.size());
   call_interface_impl::addArgAndResultAttrs(
-      builder, state, argAttrs, /*resultAttrs=*/std::nullopt,
+      builder, state, argAttrs, /*resultAttrs=*/{},
       getArgAttrsAttrName(state.name), getResAttrsAttrName(state.name));
 }

lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp

Lines changed: 4 additions & 3 deletions
@@ -105,9 +105,10 @@ struct ClipAsyncCopySizePerThread
   }
 };

-class CoalesceAsyncCopyPass
-    : public impl::TritonGPUCoalesceAsyncCopyBase<CoalesceAsyncCopyPass> {
-public:
+struct CoalesceAsyncCopyPass
+    : impl::TritonGPUCoalesceAsyncCopyBase<CoalesceAsyncCopyPass> {
+  using Base::Base;
+
   void runOnOperation() override {
     ModuleOp m = getOperation();
     MLIRContext *context = &getContext();
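The switch from class to struct with using Base::Base; follows the usual idiom for tablegen-generated MLIR passes. A minimal sketch with hypothetical names; MyExamplePass and MyExamplePassBase stand in for a generated pass and its base, they are not part of this patch:

// using Base::Base; inherits the constructors generated on the pass base
// class (default and options-taking), so no constructor boilerplate is needed.
struct MyExamplePass : impl::MyExamplePassBase<MyExamplePass> {
  using Base::Base;

  void runOnOperation() override {
    // ... pass logic operating on getOperation() ...
  }
};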

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/LoadMMASpecialization.cpp

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,3 @@
-#include "PartitionBuilder.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dominance.h"
@@ -9,6 +8,7 @@
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h"
 #include "triton/Dialect/TritonGPU/Transforms/Partition.h"
+#include "triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h"
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionBuilder.cpp

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-#include "PartitionBuilder.h"
+#include "triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h"
 #include "triton/Dialect/TritonGPU/Transforms/Partition.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
lib/Dialect/TritonGPU/Transforms/WarpSpecialization/RewritePartitionDependencies.cpp

Lines changed: 1 addition & 1 deletion
@@ -1,10 +1,10 @@
-#include "PartitionBuilder.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/Pass/Pass.h"
 #include "nvidia/include/Dialect/NVWS/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/Partition.h"
+#include "triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h"
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
