Commit 91a9281

Merge commit '4327b5b62f2abab716f275e52f4c5535785a2ab7'
Signed-off-by: Anatoly Myachev <[email protected]>
2 parents 4779124 + 4327b5b

58 files changed, +992 −446 lines changed


.github/workflows/integration-tests-amd.yml

Lines changed: 10 additions & 11 deletions
@@ -18,22 +18,23 @@ jobs:
       matrix:
         runner: ${{ fromJson(inputs.matrix) }}
         include:
-          - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+          - image: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0
             runner: ["self-hosted", "gfx90a"]
             # Cache save/restore is on the host machine at directory /home/runner/.triton, while in the docker
             # container expect it at /github/home/.triton. So map here to make sure visible in docker.
             options: >-
               --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
               --volume /home/runner/.triton:/github/home/.triton
-          - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+          - image: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0
             runner: ["amd-gfx942"]
             # We add --env-file to pull in HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definition for GPU isolation.
             options: >-
               --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
               --env-file /etc/podinfo/gha-gpu-isolation-settings
               --volume /home/runner/.triton:/github/home/.triton
-          - image: rocm/7.0-preview:rocm7.0_preview_ubuntu22.04_llama2_70b_training_mlperf_mi35X_prealpha
+          - image: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0
             runner: ["amd-gfx950"]
+            # We add --env-file to pull in HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definition for GPU isolation.
             options: >-
               --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
               --env-file /etc/podinfo/gha-gpu-isolation-settings

@@ -83,14 +84,16 @@ jobs:
             ~/.triton/nvidia
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
+      - name: Install dependencies
+        run: apt-get install -y clang lld ccache
       - name: Inspect cache directories
         run: |
           mkdir -p ~/.triton
           du -h -d 1 ~/.triton

           mkdir -p ~/.ccache
           du -h -d 1 ~/.ccache
-      - name: Update compiler to clang
+      - name: Update compiler to Clang
         run: |
           export CC=/usr/bin/clang
           export CXX=/usr/bin/clang++

@@ -100,19 +103,15 @@ jobs:
           echo "PATH is '$PATH'"
           pip uninstall -y triton pytorch-triton-rocm

-          if [ "${{ matrix.runner[0] }}" != "amd-gfx950" ]; then
-            ccache --zero-stats
-          fi
-
+          ccache --zero-stats
           make dev-install
-      - name: CCache Stats
-        if: ${{ matrix.runner[0] != 'amd-gfx950' }}
+      - name: Print ccache stats
         run: ccache --print-stats
       - name: Run lit tests
         run: make test-lit
       - name: Run C++ unittests
         run: make test-cpp
-      - name: Run python tests on AMD
+      - name: Run Python tests on AMD
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
           if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 2 additions & 0 deletions
@@ -303,6 +303,8 @@ SetVector<int> getPartitionIds(Operation *op);
 SmallVector<SetVector<int>, 4> getPartitionOutputs(Operation *op);
 SetVector<int> getPartitionIds(OpOperand *use);
 bool hasPartition(Operation *op);
+bool hasWarpSpecializeTag(Operation *op);
+std::optional<int> getWarpSpecializeTag(Operation *op);

 } // namespace mlir::triton::gpu

include/triton/Dialect/TritonGPU/Transforms/Partition.h

Lines changed: 1 addition & 0 deletions
@@ -120,6 +120,7 @@ void setPartition(Operation *op, const SetVector<Partition *> &partitions);
 void setPartition(Operation *op, const SetVector<int> &partitionIds);
 void setPartitionOutputs(Operation *op,
                          ArrayRef<SetVector<int>> partitionOutputsIds);
+void setWarpSpecializeTag(Operation *op, int tag);

 } // namespace mlir::triton::gpu

include/triton/Dialect/TritonNvidiaGPU/IR/Dialect.h

Lines changed: 2 additions & 2 deletions
@@ -107,8 +107,8 @@ inline const char *getOpShape(TMemAccessAtom atom) {
   llvm_unreachable("Unknown TMemAccessAtom");
 }

-LinearLayout getTileLayout(MLIRContext *ctx, TMemAccessAtom atom,
-                           bool unpacked);
+LinearLayout getTileLayout(MLIRContext *ctx, TMemAccessAtom atom, bool unpacked,
+                           bool withWarp);

 TMemAllocation getTmemAllocSizes(gpu::MemDescType memDescType);

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUAttrDefs.td

Lines changed: 4 additions & 1 deletion
@@ -40,14 +40,17 @@ def TTG_TensorMemoryEncodingAttr : AttrDef<TritonNvidiaGPU_Dialect, "TensorMemor
     that is, the stride between two elements in the same row.
     When colStride is 1 the tensor memory is packed. When colStride > 1, the
     tensor memory between elements is undefined.
+    `twoCTAs` indicates that the tensor memory is laid out for twoCTA mode,
+    i.e., `cta_group::2`.
   }];
   let parameters = (
     ins
     "unsigned":$blockM,
     "unsigned":$blockN,
     "unsigned":$colStride,
     DefaultValuedParameter<"unsigned", "1">:$CTASplitM,
-    DefaultValuedParameter<"unsigned", "1">:$CTASplitN
+    DefaultValuedParameter<"unsigned", "1">:$CTASplitN,
+    DefaultValuedParameter<"bool", "false">:$twoCTAs
   );
   let genVerifyDecl = 1;
   let assemblyFormat = "`<` struct(params) `>`";
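For illustration only (this sketch is not part of the commit, and the concrete shape values are hypothetical), the new `twoCTAs` flag becomes the last argument of the attribute's C++ builder, mirroring the call sites updated in AccelerateMatmul.cpp below:

    // Hypothetical sketch: constructing the encoding with the new twoCTAs flag.
    #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"

    mlir::Attribute buildAccEncoding(mlir::MLIRContext *ctx, bool useTwoCTAs) {
      return mlir::triton::nvidia_gpu::TensorMemoryEncodingAttr::get(
          ctx, /*blockM=*/128, /*blockN=*/128, /*colStride=*/1,
          /*CTASplitM=*/1, /*CTASplitN=*/1, /*twoCTAs=*/useTwoCTAs);
    }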

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 11 additions & 0 deletions
@@ -4051,3 +4051,14 @@ SetVector<int> triton::gpu::getPartitionIds(OpOperand *use) {
 bool triton::gpu::hasPartition(Operation *op) {
   return op && op->hasAttr(kPartitionAttrName);
 }
+
+bool triton::gpu::hasWarpSpecializeTag(Operation *op) {
+  return op && op->hasAttr(kWarpSpecializeTagAttrName);
+}
+
+std::optional<int> triton::gpu::getWarpSpecializeTag(Operation *op) {
+  if (hasWarpSpecializeTag(op)) {
+    return cast<IntegerAttr>(op->getAttr(kWarpSpecializeTagAttrName)).getInt();
+  }
+  return std::nullopt;
+}
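A minimal usage sketch of the new tag helpers, assuming only the declarations added to Dialect.h and Partition.h in this commit (the function `tagAndQuery` itself is hypothetical):

    // Hypothetical usage sketch; not part of the commit.
    #include "triton/Dialect/TritonGPU/IR/Dialect.h"
    #include "triton/Dialect/TritonGPU/Transforms/Partition.h"
    #include "llvm/Support/raw_ostream.h"

    void tagAndQuery(mlir::Operation *op) {
      // Attach a warp-specialize tag; stored as an i32 attribute on the op.
      mlir::triton::gpu::setWarpSpecializeTag(op, /*tag=*/0);
      // Read it back; returns std::nullopt when the attribute is absent.
      if (std::optional<int> tag = mlir::triton::gpu::getWarpSpecializeTag(op))
        llvm::errs() << "warp specialize tag = " << *tag << "\n";
    }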

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 27 additions & 8 deletions
@@ -1105,20 +1105,39 @@ LinearLayout tensorMemoryToLinearLayout(ArrayRef<int64_t> shape,
         LinearLayout::identity1D(encoding.getCTASplitN(), kCol, dims[1]);
     auto newEncoding = TensorMemoryEncodingAttr::get(
         ctx, encoding.getBlockM(), encoding.getBlockN(),
-        encoding.getColStride(), encoding.getCTASplitM(), 1);
+        encoding.getColStride(), encoding.getCTASplitM(), 1,
+        encoding.getTwoCTAs());
     return tensorMemoryToLinearLayout(
                {shape[0], shape[1] / encoding.getCTASplitN()}, newEncoding) *
            split;
   }
   if (encoding.getCTASplitM() > 1) {
-    auto split =
-        LinearLayout::identity1D(encoding.getCTASplitM(), kCol, dims[0]);
+    auto splitM = encoding.getCTASplitM();
+    auto blockM = encoding.getBlockM();
+    bool isM64TwoCTA = blockM == 64 && encoding.getTwoCTAs();
+    if (isM64TwoCTA) {
+      // blockM == 64 and twoCTAs is laid out as the transpose of 128xblockN
+      // https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-data-path-layout-b
+      blockM *= 2;
+      splitM /= 2;
+    }
+    auto split = LinearLayout::identity1D(splitM, kCol, dims[0]);
     auto newEncoding = TensorMemoryEncodingAttr::get(
-        ctx, encoding.getBlockM(), encoding.getBlockN(),
-        encoding.getColStride(), 1, encoding.getCTASplitN());
-    return tensorMemoryToLinearLayout(
-               {shape[0] / encoding.getCTASplitM(), shape[1]}, newEncoding) *
-           split;
+        ctx, blockM, encoding.getBlockN(), encoding.getColStride(), 1,
+        encoding.getCTASplitN(), encoding.getTwoCTAs());
+    auto ret =
+        tensorMemoryToLinearLayout({shape[0] / splitM, shape[1]}, newEncoding) *
+        split;
+    // In this case, we swap the basis of the last row and last column as per
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-data-path-layout-bny
+    if (isM64TwoCTA) {
+      auto bases = ret.getBases();
+      auto &rowBases = bases[kRow];
+      auto &colBases = bases[kCol];
+      std::swap(rowBases[rowBases.size() - 1], colBases[colBases.size() - 1]);
+      ret = LinearLayout(bases, ret.getOutDims(), ret.isSurjective());
+    }
+    return ret;
   }
   assert(encoding.getCTASplitM() == 1 && encoding.getCTASplitN() == 1);

lib/Dialect/TritonGPU/IR/Types.cpp

Lines changed: 8 additions & 0 deletions
@@ -143,6 +143,14 @@ LogicalResult MemDescType::verify(function_ref<InFlightDiagnostic()> emitError,
                          << ll.getOutDimSize(dims[0]) << "x"
                          << ll.getOutDimSize(dims[1]);
     }
+    // Note the following holds for both M=64 and M=128 with 2CTA
+    auto nCol = ll.getInDimSize(StringAttr::get(ctx, "col"));
+    if (nCol / (enc.getCTASplitM() * enc.getCTASplitN()) >
+        512 * 32 / bitwidth) {
+      return emitError() << "nCol / (CTASplitM * CTASplitN) must be less than "
+                            "or equal to 512 * 32 / bitwidth but got "
+                         << nCol / (enc.getCTASplitM() * enc.getCTASplitN());
+    }
   } else if (auto enc = dyn_cast<SharedEncodingTrait>(encoding)) {
     if (memorySpace != SharedMemorySpaceAttr::get(ctx)) {
       return emitError()
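Worked example of the new bound (arithmetic only, not stated in the commit): with CTASplitM = CTASplitN = 1 and 32-bit elements, the verifier allows at most 512 * 32 / 32 = 512 columns; with 16-bit elements the limit doubles to 1024 columns.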

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 2 additions & 2 deletions
@@ -566,7 +566,7 @@ class BlockedToMMAv5 : public mlir::OpRewritePattern<DotOp> {
     unsigned colStride = 32 / bitwidth;
     Attribute accEncoding = triton::nvidia_gpu::TensorMemoryEncodingAttr::get(
         context, instrShape[0], instrShape[1], colStride, CTASplitNum[0],
-        CTASplitNum[1]);
+        CTASplitNum[1], useTwoCTAs);
     Attribute tensorMemorySpace =
         triton::nvidia_gpu::TensorMemorySpaceAttr::get(context);
     MemDescType accMemDescType =

@@ -847,7 +847,7 @@ class ScaledBlockedToMMAv5
     auto bitwidth = oldRetType.getElementType().getIntOrFloatBitWidth();
     unsigned colStride = 32 / bitwidth;
     Attribute accEncoding = triton::nvidia_gpu::TensorMemoryEncodingAttr::get(
-        context, m, n, colStride, CTASplitNum[0], CTASplitNum[1]);
+        context, m, n, colStride, CTASplitNum[0], CTASplitNum[1], false);
     Attribute tensorMemorySpace =
         triton::nvidia_gpu::TensorMemorySpaceAttr::get(context);
     MemDescType accMemDescType =

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/Partition.cpp

Lines changed: 5 additions & 0 deletions
@@ -229,4 +229,9 @@ void setPartition(Operation *op, const SetVector<Partition *> &partitions) {
   setPartition(op, partitionIds);
 }

+void setWarpSpecializeTag(Operation *op, int tag) {
+  Builder b(op->getContext());
+  op->setAttr(kWarpSpecializeTagAttrName, b.getI32IntegerAttr(tag));
+}
+
 } // namespace mlir::triton::gpu
