Skip to content

Commit 6ca2dda

Browse files
antiagainst, yiqian1, and ThomasRaoux
authored
[Backend] Bump to llvm/llvm-project@bc773632355b (#7881)
* Switched `Constant{Int|Float}Op` type and value order following llvm/llvm-project@a45fda6
* Provided triple for `TargetLibraryInfoImpl` following llvm/llvm-project@c91cbaf
* Fixed atomic sync scope for NVIDIA following llvm/llvm-project@0f1b16d
* Updated MLIR lib names following llvm/llvm-project@e68a20e
* Updated `nvvm.stmatrix` op following llvm/llvm-project@2b27377
* Updated `ROCDL::Mbcnt{Lo|Hi}Op` following llvm/llvm-project@bbe3d64

Closes triton-lang/triton#7413
Closes triton-lang/triton#7575
Closes triton-lang/triton#7765

---------

Co-authored-by: Yi Qian <[email protected]>
Co-authored-by: Thomas Raoux <[email protected]>
1 parent 83fbc0e commit 6ca2dda

File tree

20 files changed

+94
-104
lines changed

20 files changed

+94
-104
lines changed

bin/CMakeLists.txt

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,10 @@
1-
get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
2-
get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
31
get_property(triton_libs GLOBAL PROPERTY TRITON_LIBS)
42

53
add_llvm_executable(triton-opt triton-opt.cpp PARTIAL_SOURCES_INTENDED)
64

75
# TODO: what's this?
86
llvm_update_compile_flags(triton-opt)
97
target_link_libraries(triton-opt PRIVATE
10-
${dialect_libs}
11-
${conversion_libs}
128
${triton_libs}
139
# tests
1410
TritonTestAnalysis
@@ -18,6 +14,8 @@ target_link_libraries(triton-opt PRIVATE
1814
# MLIR core
1915
MLIROptLib
2016
MLIRPass
17+
MLIRRegisterAllDialects
18+
MLIRRegisterAllPasses
2119
MLIRTransforms
2220
)
2321

@@ -28,8 +26,6 @@ mlir_check_all_link_libraries(triton-reduce)
2826

2927
llvm_update_compile_flags(triton-reduce)
3028
target_link_libraries(triton-reduce PRIVATE
31-
${dialect_libs}
32-
${conversion_libs}
3329
${triton_libs}
3430
# tests
3531
TritonTestAnalysis
@@ -39,6 +35,8 @@ target_link_libraries(triton-reduce PRIVATE
3935
# MLIR core
4036
MLIRReduceLib
4137
MLIRPass
38+
MLIRRegisterAllDialects
39+
MLIRRegisterAllPasses
4240
MLIRTransforms
4341
)
4442

@@ -48,8 +46,6 @@ add_llvm_executable(triton-lsp triton-lsp.cpp PARTIAL_SOURCES_INTENDED)
4846

4947
llvm_update_compile_flags(triton-lsp)
5048
target_link_libraries(triton-lsp PRIVATE
51-
${dialect_libs}
52-
${conversion_libs}
5349
${triton_libs}
5450
# tests
5551
TritonTestAnalysis
@@ -59,6 +55,8 @@ target_link_libraries(triton-lsp PRIVATE
5955
# MLIR core
6056
MLIRLspServerLib
6157
MLIRPass
58+
MLIRRegisterAllDialects
59+
MLIRRegisterAllPasses
6260
MLIRTransforms
6361
)
6462

@@ -88,10 +86,11 @@ export_executable_symbols_for_plugins(triton-llvm-opt)
8886
add_llvm_executable(triton-tensor-layout triton-tensor-layout.cpp PARTIAL_SOURCES_INTENDED)
8987
target_link_libraries(triton-tensor-layout PRIVATE
9088
${triton_libs}
91-
${conversion_libs}
92-
${dialect_libs}
9389
TritonTestAnalysis
9490
TritonTestDialect
9591
TritonTestProton
9692
TritonAMDGPUTestAnalysis
93+
MLIRRegisterAllDialects
94+
MLIRRegisterAllPasses
95+
MLIRTransforms
9796
)

cmake/llvm-hash.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
570885128351868c1308bb22e8ca351d318bc4a1
1+
bc773632355b3cebde350b0341624e88be40b744

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -100,12 +100,6 @@ def TTG_AsyncCopyGlobalToLocalOp : TTG_Op<"async_copy_global_to_local", [
100100
DefaultValuedAttr<BoolAttr, "false">:$isVolatile
101101
);
102102

103-
let builders = [
104-
OpBuilder<(ins "Value":$src, "Value":$result,
105-
"triton::CacheModifier":$cache,
106-
"triton::EvictionPolicy":$evict, "bool":$isVolatile)>,
107-
];
108-
109103
let results = (outs TTG_AsyncToken:$token);
110104

111105
let extraClassDeclaration = [{
@@ -395,9 +389,6 @@ def TTG_MaskOp: TTG_Op<"mask",
395389
let arguments = (ins I1:$pred);
396390
let results = (outs Variadic<AnyType>:$result);
397391
let regions = (region SizedRegion<1>:$region);
398-
let builders = [
399-
OpBuilder<(ins "Value":$pred)>,
400-
];
401392
}
402393

403394
def TTG_MaskReturnOp: TTG_Op<"mask.return",

lib/Dialect/Triton/Transforms/RewriteTensorDescriptorToPointer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ Value generateMaskFromOffsetRanges(OpBuilder &builder, const Location &loc,
166166

167167
// Compare with lower bound
168168
Value lowerBound = builder.create<mlir::arith::ConstantIntOp>(
169-
loc, 0, builder.getI64Type());
169+
loc, builder.getI64Type(), 0);
170170
Value splatLowerBound = builder.create<triton::SplatOp>(
171171
loc, offsetWithRange.getType(), lowerBound);
172172
Value cmpLower = builder.create<arith::CmpIOp>(

lib/Dialect/Triton/Transforms/RewriteTensorPointer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ struct RewritedInfo {
135135

136136
// Compare with lower bound
137137
Value lowerBound = builder.create<mlir::arith::ConstantIntOp>(
138-
loc, 0, builder.getI64Type());
138+
loc, builder.getI64Type(), 0);
139139
Value splatLowerBound = builder.create<triton::SplatOp>(
140140
loc, offsetWithRange.getType(), lowerBound);
141141
Value cmpLower = builder.create<arith::CmpIOp>(

lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,11 @@ static void expandLoops(ModuleOp moduleOp) {
6767
if (isEpilogue) {
6868
// Return false for the predicate of the peeled iteration
6969
return rewriter.create<mlir::arith::ConstantIntOp>(
70-
predOp.getLoc(), 0, predOp.getResult().getType());
70+
predOp.getLoc(), predOp.getResult().getType(), 0);
7171
} else {
7272
if (predOp.getStage() == predOp.getMaxStage() - 1) {
7373
return rewriter.create<mlir::arith::ConstantIntOp>(
74-
predOp.getLoc(), 1, predOp.getResult().getType());
74+
predOp.getLoc(), predOp.getResult().getType(), 1);
7575
} else {
7676
OpBuilder::InsertionGuard guard(rewriter);
7777
rewriter.setInsertionPoint(op);

python/src/ir.cc

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -790,53 +790,53 @@ void init_triton_ir(py::module &&m) {
790790
.def("get_int1",
791791
[](TritonOpBuilder &self, bool v) -> Value {
792792
return Value(self.create<arith::ConstantIntOp>(
793-
v, self.getBuilder().getI1Type()));
793+
self.getBuilder().getI1Type(), v));
794794
})
795795
.def("get_int8",
796796
[](TritonOpBuilder &self, int64_t v) -> Value {
797797
return Value(self.create<arith::ConstantIntOp>(
798-
v, self.getBuilder().getI8Type()));
798+
self.getBuilder().getI8Type(), v));
799799
})
800800
.def("get_int16",
801801
[](TritonOpBuilder &self, int64_t v) -> Value {
802802
return Value(self.create<arith::ConstantIntOp>(
803-
v, self.getBuilder().getI16Type()));
803+
self.getBuilder().getI16Type(), v));
804804
})
805805
.def("get_int32",
806806
[](TritonOpBuilder &self, int64_t v) -> Value {
807807
return Value(self.create<arith::ConstantIntOp>(
808-
v, self.getBuilder().getI32Type()));
808+
self.getBuilder().getI32Type(), v));
809809
})
810810
.def("get_int64",
811811
[](TritonOpBuilder &self, int64_t v) -> Value {
812812
return Value(self.create<arith::ConstantIntOp>(
813-
v, self.getBuilder().getI64Type()));
813+
self.getBuilder().getI64Type(), v));
814814
})
815815
.def("get_uint8",
816816
[](TritonOpBuilder &self, uint64_t v) -> Value {
817817
return Value(self.create<arith::ConstantIntOp>(
818-
v, self.getBuilder().getI8Type()));
818+
self.getBuilder().getI8Type(), v));
819819
})
820820
.def("get_uint16",
821821
[](TritonOpBuilder &self, uint64_t v) -> Value {
822822
return Value(self.create<arith::ConstantIntOp>(
823-
v, self.getBuilder().getI16Type()));
823+
self.getBuilder().getI16Type(), v));
824824
})
825825
.def("get_uint32",
826826
[](TritonOpBuilder &self, uint64_t v) -> Value {
827827
return Value(self.create<arith::ConstantIntOp>(
828-
v, self.getBuilder().getI32Type()));
828+
self.getBuilder().getI32Type(), v));
829829
})
830830
.def("get_uint64",
831831
[](TritonOpBuilder &self, uint64_t v) -> Value {
832832
return Value(self.create<arith::ConstantIntOp>(
833-
v, self.getBuilder().getI64Type()));
833+
self.getBuilder().getI64Type(), v));
834834
})
835835
.def("get_bf16",
836836
[](TritonOpBuilder &self, float v) -> Value {
837837
auto type = self.getBuilder().getBF16Type();
838838
return self.create<arith::ConstantFloatOp>(
839-
APFloat(type.getFloatSemantics(), std::to_string(v)), type);
839+
type, APFloat(type.getFloatSemantics(), std::to_string(v)));
840840
})
841841
.def("get_fp16",
842842
[](TritonOpBuilder &self, float v) -> Value {
@@ -857,17 +857,17 @@ void init_triton_ir(py::module &&m) {
857857
[](TritonOpBuilder &self, Type type) -> Value {
858858
if (auto floatTy = dyn_cast<FloatType>(type))
859859
return self.create<arith::ConstantFloatOp>(
860-
APFloat(floatTy.getFloatSemantics(), 0), floatTy);
860+
floatTy, APFloat(floatTy.getFloatSemantics(), 0));
861861
else if (auto intTy = dyn_cast<IntegerType>(type))
862-
return self.create<arith::ConstantIntOp>(0, intTy);
862+
return self.create<arith::ConstantIntOp>(intTy, 0);
863863
else
864864
throw std::runtime_error("Not implemented");
865865
})
866866
.def("get_all_ones_value",
867867
[](TritonOpBuilder &self, Type type) -> Value {
868868
uint64_t val = 0xFFFFFFFFFFFFFFFF;
869869
if (auto intTy = dyn_cast<IntegerType>(type))
870-
return self.create<arith::ConstantIntOp>(val, intTy);
870+
return self.create<arith::ConstantIntOp>(intTy, val);
871871
else
872872
throw std::runtime_error("Not implemented");
873873
})

python/src/llvm.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ void init_triton_llvm(py::module &&m) {
320320
ModuleAnalysisManager mam;
321321

322322
if (arch.empty()) {
323-
llvm::TargetLibraryInfoImpl TLII;
323+
llvm::TargetLibraryInfoImpl TLII(mod->getTargetTriple());
324324
TLII.disableAllFunctions();
325325
fam.registerPass([TLII = std::move(TLII)] {
326326
return llvm::TargetLibraryAnalysis(TLII);

python/test/unit/language/test_core.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1576,7 +1576,7 @@ def kernel(X, Z):
15761576
# atom.add.bf16 is unsupported prior to Hopper so instead we generate an
15771577
# atom.cas add loop on Ampere and prior
15781578
if dst_type == 'bfloat16' and torch.cuda.get_device_capability()[0] < 9:
1579-
assert f"atom.{sem_str}.global.cas" in h.asm["ptx"]
1579+
assert f"atom.{sem_str}.gpu.global.cas" in h.asm["ptx"]
15801580
return
15811581

15821582
assert f"atom.global.gpu.{sem_str}" in h.asm["ptx"]

python/test/unit/language/test_line_info.py

Lines changed: 33 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -262,25 +262,26 @@ def kernel_basic(src, N, BLOCK_SIZE: tl.constexpr):
262262
# CHECK: #loc = loc("{{.*}}":261:0)
263263
# CHECK-LABEL: tt.func public @kernel_basic(
264264
# CHECK-SAME: %src: !tt.ptr<f32> loc("src"(#loc)), %N: i32 loc("N"(#loc)))
265-
# CHECK: %cst = arith.constant dense<1.000000e+00> : tensor<16xf32> loc(#loc1)
266-
# CHECK: %c16_i32 = arith.constant 16 : i32 loc(#loc1)
267-
# CHECK: %pid = tt.get_program_id x : i32 loc(#loc14)
268-
# CHECK: %offset = arith.muli %pid, %c16_i32 : i32 loc(#loc15)
269-
# CHECK: %offsets = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc16)
270-
# CHECK: %offsets_0 = tt.splat %offset : i32 -> tensor<16xi32> loc(#loc17)
271-
# CHECK: %offsets_1 = arith.addi %offsets_0, %offsets : tensor<16xi32> loc(#loc17)
272-
# CHECK: %load_src_store_dst = tt.splat %src : !tt.ptr<f32> -> tensor<16x!tt.ptr<f32>> loc(#loc18)
273-
# CHECK: %load_src_store_dst_2 = tt.addptr %load_src_store_dst, %offsets_1 : tensor<16x!tt.ptr<f32>>, tensor<16xi32> loc(#loc18)
274-
# CHECK: %mask = tt.splat %N : i32 -> tensor<16xi32> loc(#loc19)
275-
# CHECK: %mask_3 = arith.cmpi slt, %offsets_1, %mask : tensor<16xi32> loc(#loc19)
276-
# CHECK: %x_plus_1 = tt.load %load_src_store_dst_2, %mask_3 : tensor<16x!tt.ptr<f32>> loc(#loc20)
277-
# CHECK: %x_plus_1_4 = arith.addf %x_plus_1, %cst : tensor<16xf32> loc(#loc21)
278-
# CHECK: tt.store %load_src_store_dst_2, %x_plus_1_4, %mask_3 : tensor<16x!tt.ptr<f32>> loc(#loc10)
265+
# CHECK: %x_plus_1 = arith.constant dense<1.000000e+00> : tensor<16xf32> loc(#loc14)
266+
# CHECK: %c16_i32 = arith.constant 16 : i32 loc(#loc2)
267+
# CHECK: %pid = tt.get_program_id x : i32 loc(#loc15)
268+
# CHECK: %offset = arith.muli %pid, %c16_i32 : i32 loc(#loc16)
269+
# CHECK: %offsets = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc17)
270+
# CHECK: %offsets_0 = tt.splat %offset : i32 -> tensor<16xi32> loc(#loc18)
271+
# CHECK: %offsets_1 = arith.addi %offsets_0, %offsets : tensor<16xi32> loc(#loc18)
272+
# CHECK: %load_src_store_dst = tt.splat %src : !tt.ptr<f32> -> tensor<16x!tt.ptr<f32>> loc(#loc19)
273+
# CHECK: %load_src_store_dst_2 = tt.addptr %load_src_store_dst, %offsets_1 : tensor<16x!tt.ptr<f32>>, tensor<16xi32> loc(#loc19)
274+
# CHECK: %mask = tt.splat %N : i32 -> tensor<16xi32> loc(#loc20)
275+
# CHECK: %mask_3 = arith.cmpi slt, %offsets_1, %mask : tensor<16xi32> loc(#loc20)
276+
# CHECK: %x_plus_1_4 = tt.load %load_src_store_dst_2, %mask_3 : tensor<16x!tt.ptr<f32>> loc(#loc21)
277+
# CHECK: %x_plus_1_5 = arith.addf %x_plus_1_4, %x_plus_1 : tensor<16xf32> loc(#loc14)
278+
# CHECK: tt.store %load_src_store_dst_2, %x_plus_1_5, %mask_3 : tensor<16x!tt.ptr<f32>> loc(#loc10)
279279
# CHECK: tt.return loc(#loc11)
280-
# CHECK: } loc(#loc)
280+
# CHECK: } loc(#loc)
281+
# CHECK: } loc(#loc)
281282

282-
# CHECK: #loc1 = loc(unknown)
283-
# CHECK: #loc2 = loc({{.*}})
283+
# CHECK: #loc1 = loc({{.*}})
284+
# CHECK: #loc2 = loc(unknown)
284285
# CHECK: #loc3 = loc({{.*}})
285286
# CHECK: #loc4 = loc({{.*}})
286287
# CHECK: #loc5 = loc({{.*}})
@@ -290,13 +291,13 @@ def kernel_basic(src, N, BLOCK_SIZE: tl.constexpr):
290291
# CHECK: #loc9 = loc({{.*}})
291292
# CHECK: #loc10 = loc({{.*}})
292293
# CHECK: #loc11 = loc({{.*}})
293-
# CHECK: #loc14 = loc("pid"(#loc2))
294-
# CHECK: #loc15 = loc("offset"(#loc3))
295-
# CHECK: #loc16 = loc("offsets"(#loc4))
294+
# CHECK: #loc14 = loc("x_plus_1"(#loc1))
295+
# CHECK: #loc15 = loc("pid"(#loc3))
296+
# CHECK: #loc16 = loc("offset"(#loc4))
296297
# CHECK: #loc17 = loc("offsets"(#loc5))
297-
# CHECK: #loc18 = loc("load_src_store_dst"(#loc6))
298-
# CHECK: #loc19 = loc("mask"(#loc7))
299-
# CHECK: #loc20 = loc("x_plus_1"(#loc8))
298+
# CHECK: #loc18 = loc("offsets"(#loc6))
299+
# CHECK: #loc19 = loc("load_src_store_dst"(#loc7))
300+
# CHECK: #loc20 = loc("mask"(#loc8))
300301
# CHECK: #loc21 = loc("x_plus_1"(#loc9))
301302

302303
pid = tl.program_id(0)
@@ -404,20 +405,20 @@ def kernel_basic_while(N):
404405
# CHECK: %arange = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
405406
arange = tl.arange(0, 16)
406407
ivar = 0
407-
# CHECK: %ivar:2 = scf.while (%arange_0 = %arange, %ivar_1 = %c0_i32) : (tensor<16xi32>, i32) -> (tensor<16xi32>, i32)
408-
# CHECK: %[[COND:.*]] = arith.cmpi slt, %ivar_1, %N : i32
409-
# CHECK: scf.condition(%[[COND]]) %arange_0, %ivar_1 : tensor<16xi32>, i32
408+
# CHECK: %ivar_[[IV0:.+]]:2 = scf.while (%arange_[[AR0:.+]] = %arange, %ivar_[[IV1:.+]] = %ivar) : (tensor<16xi32>, i32) -> (tensor<16xi32>, i32)
409+
# CHECK: %[[COND:.*]] = arith.cmpi slt, %ivar_[[IV1]], %N : i32
410+
# CHECK: scf.condition(%[[COND]]) %arange_[[AR0]], %ivar_[[IV1]] : tensor<16xi32>, i32
410411
while ivar < N:
411-
# CHECK: ^bb0(%arange_0: tensor<16xi32> loc("arange"), %ivar_1: i32
412+
# CHECK: ^bb0(%arange_[[AR0]]: tensor<16xi32> loc("arange"), %ivar_[[IV1]]: i32
412413

413-
# CHECK: %ivar_2 = arith.addi %ivar_1, %c1_i32 : i32
414+
# CHECK: %ivar_[[IV2:.+]] = arith.addi %ivar_[[IV1]], %c1_i32 : i32
414415
ivar += 1
415-
# CHECK: %arange_3 = tt.splat %ivar_2 : i32 -> tensor<16xi32>
416-
# CHECK: %arange_4 = arith.muli %arange_0, %arange_3 : tensor<16xi32>
417-
# CHECK: scf.yield %arange_4, %ivar_2 : tensor<16xi32>, i32
416+
# CHECK: %arange_[[AR1:.+]] = tt.splat %ivar_[[IV2]] : i32 -> tensor<16xi32>
417+
# CHECK: %arange_[[AR2:.+]] = arith.muli %arange_[[AR0]], %arange_[[AR1]] : tensor<16xi32>
418+
# CHECK: scf.yield %arange_[[AR2]], %ivar_[[IV2]] : tensor<16xi32>, i32
418419
arange *= ivar
419420

420-
# CHECK: tt.print ": " {hex = false, isSigned = array<i32: 1>} : %ivar#0 : tensor<16xi32>
421+
# CHECK: tt.print ": " {hex = false, isSigned = array<i32: 1>} : %ivar_[[IV0]]#0 : tensor<16xi32>
421422
tl.device_print("", arange)
422423

423424
h = triton.compile(triton.compiler.ASTSource(fn=kernel_basic_while, signature={"N": "i32"}, constexprs={}))

0 commit comments

Comments (0)