Commit f11c5ba

Authored by jataylo, peterbell10, htyu, antiagainst, jungpark-mlir
[rc/3.2.x] LLVM bump for gfx950 target support (#5417)
This PR brings in the required LLVM bumps and additional targets for gfx950 support:
- #5040
- #5064
- #5180
- #5242
- #5392

Note: this PR reverts the following two PRs to focus only on the LLVM upgrade:
- #5347
- #5191

Co-authored-by: peterbell10 <[email protected]>
Co-authored-by: Hongtao Yu <[email protected]>
Co-authored-by: Lei Zhang <[email protected]>
Co-authored-by: Jungwook Park <[email protected]>
1 parent 7e401df commit f11c5ba

File tree

14 files changed: +240, -681 lines


cmake/llvm-hash.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1f20eee6dc367bd202895e3eedb03974a628ef16
+86b69c31642e98f8357df62c09d118ad1da4e16a

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 0 additions & 4 deletions
@@ -727,10 +727,6 @@ def TT_ReduceOp: TT_Op<"reduce",
     llvm::SmallVector<RankedTensorType> getInputTypes();
     llvm::SmallVector<Type> getElementTypes();
     unsigned getNumOperands();
-
-    // Returns the CombineOp iff this ReduceOp's region contains only
-    // one CombineOp other than the return, or nullptr if not applicable.
-    ::mlir::Operation *getSingleCombiner();
   }];
 }

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 0 additions & 16 deletions
@@ -503,22 +503,6 @@ llvm::SmallVector<Type> ReduceOp::getElementTypes() {
   return getElementTypesImpl(this->getOperands());
 }
 
-::mlir::Operation *ReduceOp::getSingleCombiner() {
-  if (getNumOperands() != 1 || getNumResults() != 1)
-    return nullptr;
-  Block *block = &(*getCombineOp().begin());
-  Operation *yield = block->getTerminator();
-  Operation *reduceOp = yield->getOperand(0).getDefiningOp();
-  if (!reduceOp || reduceOp->getNumOperands() != 2 ||
-      reduceOp->getNumResults() != 1)
-    return nullptr;
-  if (reduceOp->getOperand(0) != block->getArgument(0) ||
-      reduceOp->getOperand(1) != block->getArgument(1))
-    return nullptr;
-
-  return reduceOp;
-}
-
 unsigned ReduceOp::getNumOperands() { return this->getOperands().size(); }
 
 //-- ScanOp --
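
For reference, the removed helper performed a purely structural check: the reduce's combiner region had to contain exactly one two-operand, one-result op whose operands are the two block arguments, followed by the terminator (and the reduce itself had to have a single operand and result). A minimal standalone sketch of that check, written against generic MLIR APIs only (the name isSingleBinaryCombiner is hypothetical, not part of the codebase):

#include "mlir/IR/Block.h"
#include "mlir/IR/Operation.h"

// Hypothetical standalone equivalent of the removed
// ReduceOp::getSingleCombiner(): returns the single binary combiner op in
// `block`, or nullptr if the block does not have exactly that shape.
static mlir::Operation *isSingleBinaryCombiner(mlir::Block *block) {
  mlir::Operation *yield = block->getTerminator();
  if (yield->getNumOperands() != 1)
    return nullptr;
  mlir::Operation *combiner = yield->getOperand(0).getDefiningOp();
  if (!combiner || combiner->getNumOperands() != 2 ||
      combiner->getNumResults() != 1)
    return nullptr;
  // Both operands must be the block arguments themselves, i.e. the two values
  // being combined, with no intermediate computation.
  if (combiner->getOperand(0) != block->getArgument(0) ||
      combiner->getOperand(1) != block->getArgument(1))
    return nullptr;
  return combiner;
}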

lib/Dialect/Triton/Transforms/Combine.td

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@ def CombineDotAddIPattern : Pat<
   [(Constraint<CPred<"isZero($0)">> $c),
    (Constraint<CPred<"res->hasOneUse()">, "dot result has a single use">)]>;
 def CombineDotAddFPattern : Pat<
-    (Arith_AddFOp $d, (TT_DotOp:$res $a, $b, $c, $inputPrecision, $maxNumImpreciseAcc), $fastmath, $denorm),
+    (Arith_AddFOp $d, (TT_DotOp:$res $a, $b, $c, $inputPrecision, $maxNumImpreciseAcc), $fastmath),
     (TT_DotOp $a, $b, $d, $inputPrecision, $maxNumImpreciseAcc, (location $res)),
   [(Constraint<CPred<"isZero($0)">> $c),
    (Constraint<CPred<"::llvm::cast<::mlir::IntegerAttr>($0).getInt() == 0">> $maxNumImpreciseAcc),
@@ -29,7 +29,7 @@ def CombineDotAddIRevPattern : Pat<
   [(Constraint<CPred<"isZero($0)">> $c),
    (Constraint<CPred<"res->hasOneUse()">, "dot result has a single use">)]>;
 def CombineDotAddFRevPattern : Pat<
-    (Arith_AddFOp (TT_DotOp:$res $a, $b, $c, $inputPrecision, $maxNumImpreciseAcc), $d, $fastmath, $denorm),
+    (Arith_AddFOp (TT_DotOp:$res $a, $b, $c, $inputPrecision, $maxNumImpreciseAcc), $d, $fastmath),
     (TT_DotOp $a, $b, $d, $inputPrecision, $maxNumImpreciseAcc, (location $res)),
   [(Constraint<CPred<"isZero($0)">> $c),
    (Constraint<CPred<"::llvm::cast<::mlir::IntegerAttr>($0).getInt() == 0">> $maxNumImpreciseAcc),
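
These DRR patterns fold an arith.addf of a tt.dot result into the dot's accumulator when the original accumulator is zero, the dot result has a single use, and maxNumImpreciseAcc is 0; the only change here is dropping the extra $denorm argument so the Arith_AddFOp arity matches the bumped MLIR. As a reading aid, a rough C++ equivalent of CombineDotAddFPattern expressed as a RewritePattern is sketched below; this is a hedged sketch only: the accessor and builder names are assumed from the ODS-generated API, and isZeroTensor stands in for the isZero helper used by the constraints.

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/PatternMatch.h"
#include "triton/Dialect/Triton/IR/Dialect.h"

// Sketch: rewrite addf(%d, dot(%a, %b, zero)) into dot(%a, %b, %d).
// Exact Triton accessor and builder signatures may differ.
struct FoldAddFIntoDotAcc : public mlir::OpRewritePattern<mlir::arith::AddFOp> {
  using OpRewritePattern::OpRewritePattern;

  mlir::LogicalResult
  matchAndRewrite(mlir::arith::AddFOp addOp,
                  mlir::PatternRewriter &rewriter) const override {
    // The dot must feed this addf and have no other users.
    auto dotOp = addOp.getRhs().getDefiningOp<mlir::triton::DotOp>();
    if (!dotOp || !dotOp->hasOneUse())
      return mlir::failure();
    // The current accumulator must be a zero constant (isZeroTensor is a
    // hypothetical helper) and accumulation precision must not be limited.
    if (!isZeroTensor(dotOp.getC()) || dotOp.getMaxNumImpreciseAcc() != 0)
      return mlir::failure();
    // Reuse the addf's other operand as the new accumulator.
    rewriter.replaceOpWithNewOp<mlir::triton::DotOp>(
        addOp, addOp.getType(), dotOp.getA(), dotOp.getB(),
        /*c=*/addOp.getLhs(), dotOp.getInputPrecisionAttr(),
        dotOp.getMaxNumImpreciseAccAttr());
    return mlir::success();
  }
};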

test/Conversion/amd/tritongpu_to_llvm.mlir

Lines changed: 0 additions & 108 deletions
@@ -62,111 +62,3 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     tt.return
   }
 }
-
-// -----
-
-#blocked1 = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
-  // CHECK-LABEL: atomic_add_f16
-  tt.func @atomic_add_f16(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1 : tensor<256xi1, #blocked1>, %arg2 : tensor<256xf16, #blocked1>) {
-    %range = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked1>
-    %base_ptr = tt.splat %arg0 : !tt.ptr<f16> -> tensor<256x!tt.ptr<f16>, #blocked1>
-    %ptr = tt.addptr %base_ptr, %range : tensor<256x!tt.ptr<f16>, #blocked1>, tensor<256xi32, #blocked1>
-    // CHECK: llvm.cond_br
-    // CHECK: llvm.atomicrmw fadd {{.*}} vector<2xf16>
-    %0 = tt.atomic_rmw fadd, relaxed, gpu, %ptr, %arg2, %arg1 : (tensor<256x!tt.ptr<f16>, #blocked1>, tensor<256xf16, #blocked1>, tensor<256xi1, #blocked1>) -> tensor<256xf16, #blocked1>
-    tt.return
-  }
-}
-
-// -----
-
-#blocked2 = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
-  // CHECK-LABEL: atomic_add_bf16
-  tt.func @atomic_add_bf16(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32}, %arg1 : tensor<256xi1, #blocked2>, %arg2 : tensor<256xbf16, #blocked2>) {
-    %range = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked2>
-    %base_ptr = tt.splat %arg0 : !tt.ptr<bf16> -> tensor<256x!tt.ptr<bf16>, #blocked2>
-    %ptr = tt.addptr %base_ptr, %range : tensor<256x!tt.ptr<bf16>, #blocked2>, tensor<256xi32, #blocked2>
-    // CHECK: llvm.cond_br
-    // CHECK: llvm.atomicrmw fadd {{.*}} vector<2xbf16>
-    %0 = tt.atomic_rmw fadd, relaxed, gpu, %ptr, %arg2, %arg1 : (tensor<256x!tt.ptr<bf16>, #blocked2>, tensor<256xbf16, #blocked2>, tensor<256xi1, #blocked2>) -> tensor<256xbf16, #blocked2>
-    tt.return
-  }
-}
-
-// -----
-
-#blocked3 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
-  // CHECK-LABEL: reduce_dpp_max
-  tt.func @reduce_dpp_max(%arg0: tensor<64xf32, #blocked3>) {
-    // CHECK: rocdl.update.dpp
-    // CHECK-SAME: with 280, 15, 15, true : f32
-    // CHECK-NEXT: llvm.intr.maxnum
-
-    // CHECK-NEXT: rocdl.update.dpp
-    // CHECK-SAME: with 276, 15, 15, true : f32
-    // CHECK-NEXT: llvm.intr.maxnum
-
-    // CHECK-NEXT: rocdl.update.dpp
-    // CHECK-SAME: with 274, 15, 15, true : f32
-    // CHECK-NEXT: llvm.intr.maxnum
-
-    // CHECK-NEXT: rocdl.update.dpp
-    // CHECK-SAME: with 273, 15, 15, true : f32
-    // CHECK-NEXT: llvm.intr.maxnum
-
-    // CHECK-NEXT: rocdl.update.dpp
-    // CHECK-SAME: with 322, 10, 15, true : f32
-    // CHECK-NEXT: llvm.intr.maxnum
-
-    // CHECK-NEXT: rocdl.update.dpp
-    // CHECK-SAME: with 323, 15, 15, true : f32
-    // CHECK-NEXT: llvm.intr.maxnum
-
-    // CHECK: llvm.amdgcn.readlane
-    %0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
-    ^bb0(%arg1: f32, %arg2: f32):
-      %1 = arith.maxnumf %arg1, %arg2 : f32
-      tt.reduce.return %1 : f32
-    }) : (tensor<64xf32, #blocked3>) -> f32
-    tt.return
-  }
-}
-
-// -----
-
-#blocked4 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
-  // CHECK-LABEL: reduce_xor_max
-  tt.func @reduce_xor_max(%arg0: tensor<32xf32, #blocked4>) {
-    // CHECK: rocdl.ds_swizzle
-    // CHECK: llvm.intr.maxnum
-
-    // CHECK: rocdl.update.dpp
-    // CHECK-SAME: with 280, 15, 12, false : i32
-    // CHECK: rocdl.update.dpp
-    // CHECK-SAME: with 264, 15, 3, false : i32
-    // CHECK: llvm.intr.maxnum
-
-    // CHECK: rocdl.update.dpp
-    // CHECK-SAME: with 276, 15, 10, false : i32
-    // CHECK: rocdl.update.dpp
-    // CHECK-SAME: with 260, 15, 5, false : i32
-    // CHECK: llvm.intr.maxnum
-
-    // CHECK: rocdl.update.dpp
-    // CHECK-SAME: with 78, 15, 15, false : i32
-    // CHECK: llvm.intr.maxnum
-
-    // CHECK: rocdl.update.dpp
-    // CHECK-SAME: with 177, 15, 15, false : i32
-    %0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
-    ^bb0(%arg1: f32, %arg2: f32):
-      %1 = arith.maxnumf %arg1, %arg2 : f32
-      tt.reduce.return %1 : f32
-    }) : (tensor<32xf32, #blocked4>) -> f32
-    tt.return
-  }
-}

third_party/amd/include/Dialect/TritonAMDGPU/IR/Dialect.h

Lines changed: 0 additions & 1 deletion
@@ -30,7 +30,6 @@
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/PatternMatch.h"
 #include "triton/Dialect/Triton/IR/Traits.h"
-
 // clang-format off
 #include "amd/include/Dialect/TritonAMDGPU/IR/Dialect.h.inc"
 // clang-format on

third_party/amd/include/TritonAMDGPUToLLVM/TargetUtils.h

Lines changed: 0 additions & 11 deletions
@@ -19,17 +19,6 @@ enum class ISAFamily {
 // Deduces the corresponding ISA family for the given target gfx |arch|.
 ISAFamily deduceISAFamily(llvm::StringRef arch);
 
-// Here is a partial definition of DppCtrl enums. For the complete definition,
-// please check:
-// https://github.com/llvm/llvm-project/blob/8c75290/llvm/lib/Target/AMDGPU/SIDefines.h#L939
-enum class DppCtrl : uint32_t {
-  QUAD_PERM_FIRST = 0,
-  ROW_SHL0 = 0x100,
-  ROW_SHR0 = 0x110,
-  BCAST15 = 0x142,
-  BCAST31 = 0x143
-};
-
 } // namespace mlir::triton::AMD
 
 #endif // TRITON_CONVERSION_TRITONGPU_TO_LLVM_TARGETUTILS_H
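
The removed enum was a partial copy of LLVM's DppCtrl encoding (see the SIDefines.h link in the removed comment). The numeric CHECK values in the removed reduce_dpp_max test above decode directly from these bases: in the standard AMDGPU DPP encoding, row_shr:n is ROW_SHR0 + n. A small self-contained C++ check, for illustration only:

#include <cassert>
#include <cstdint>

// Bases copied verbatim from the removed partial DppCtrl definition.
enum class DppCtrl : uint32_t {
  QUAD_PERM_FIRST = 0,
  ROW_SHL0 = 0x100,
  ROW_SHR0 = 0x110,
  BCAST15 = 0x142,
  BCAST31 = 0x143,
};

// row_shr:n is encoded as ROW_SHR0 + n.
constexpr uint32_t rowShr(uint32_t n) {
  return static_cast<uint32_t>(DppCtrl::ROW_SHR0) + n;
}

int main() {
  // These match the removed reduce_dpp_max CHECK lines.
  assert(rowShr(8) == 280);
  assert(rowShr(4) == 276);
  assert(rowShr(2) == 274);
  assert(rowShr(1) == 273);
  assert(static_cast<uint32_t>(DppCtrl::BCAST15) == 322);
  assert(static_cast<uint32_t>(DppCtrl::BCAST31) == 323);
  return 0;
}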

third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 21 additions & 20 deletions
@@ -768,11 +768,7 @@ struct AtomicRMWOpConversion
     // tensor
     if (tensorTy) {
       auto valTy = cast<RankedTensorType>(val.getType());
-      Type elTy = valTy.getElementType();
-      vec = std::min<unsigned>(vec, llvm::isa<FloatType>(elTy) &&
-                                            elTy.getIntOrFloatBitWidth() == 16
-                                        ? 2
-                                        : 1);
+      vec = std::min<unsigned>(vec, valTy.getElementType().isF16() ? 2 : 1);
       // mask
       numElems = tensorTy.getNumElements();
     }
@@ -787,22 +783,13 @@ struct AtomicRMWOpConversion
     auto vecTy = vec_ty(valueElemTy, vec);
     auto retType = vec == 1 ? valueElemTy : vecTy;
     SmallVector<Value> resultVals(elemsPerThread);
+    const bool f16v2 = vec == 2 && valueElemTy.isF16();
     for (size_t i = 0; i < elemsPerThread; i += vec) {
       Value rmwPtr = ptrElements[i];
       // TODO: in case llMask is zero we can create only one branch for all
       // elemsPerThread.
       Value rmwMask = llMask ? and_(mask, maskElements[i]) : mask;
 
-      Value operand;
-      if (vec == 1) {
-        operand = valElements[i];
-      } else {
-        operand = undef(vecTy);
-        for (size_t ii = 0; ii < vec; ++ii)
-          operand =
-              insert_element(vecTy, operand, valElements[i + ii], i32_val(ii));
-      }
-
       Value undefVal = undef(retType);
       // Build blocks to bypass the atomic instruction for ~rmwMask.
       auto *curBlock = rewriter.getInsertionBlock();
@@ -819,11 +806,25 @@ struct AtomicRMWOpConversion
       auto maybeKind = matchAtomicOp(atomicRmwAttr);
       // TODO: use rocdl.raw.buffer.atomic from ROCDL dialect to use efficient
      // atomics for MI-* series of AMD GPU.
-      Value atom =
-          rewriter
-              .create<LLVM::AtomicRMWOp>(loc, *maybeKind, rmwPtr, operand,
-                                         atomicMemOrdering, StringRef("agent"))
-              .getResult();
+      Value atom = rewriter
+                       .create<LLVM::AtomicRMWOp>(
+                           loc, *maybeKind, rmwPtr, valElements[i],
+                           atomicMemOrdering, StringRef("agent"))
+                       .getResult();
+
+      // NV for the f16v2 case generates one packed instruction. We have to
+      // create two separate instructions since LLVM::AtomicRMWOp doesn't
+      // support this. Can be optimized out with rocdl.raw.buffer.atomic.
+      if (f16v2) {
+        Value atom2 =
+            rewriter
+                .create<LLVM::AtomicRMWOp>(
+                    loc, *maybeKind, ptrElements[i + 1], valElements[i + 1],
+                    atomicMemOrdering, StringRef("agent"))
+                .getResult();
+        auto tmp = insert_element(vecTy, undef(vecTy), atom, i32_val(0));
+        atom = insert_element(vecTy, tmp, atom2, i32_val(1)).getResult();
+      }
       if (!tensorTy) {
         if (atomicNeedsSharedMemory(op.getResult())) {
           Value atomPtr =
