Commit 2a7a540

Merge commit 'b39c1e14b8f2029bc6a8798e4914d2692edf97d8'
2 parents: 2c900de + b39c1e1

119 files changed: 10989 additions, 610 deletions


.github/workflows/integration-tests.yml

Lines changed: 3 additions & 3 deletions
@@ -262,13 +262,13 @@ jobs:
           echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
         fi
         cd python/test/unit
-        python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
+        python3 -m pytest -s -n 8 --ignore=cuda/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
         python3 -m pytest -s -n 8 language/test_subprocess.py
         python3 -m pytest -s -n 8 test_debug.py --forked
         # Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
         TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s language/test_line_info.py
-        # Run hopper/test_flashattention.py separately to avoid out of gpu memory
-        python3 -m pytest -s hopper/test_flashattention.py
+        # Run cuda/test_flashattention.py separately to avoid out of gpu memory
+        python3 -m pytest -s cuda/test_flashattention.py
         TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${INSTRUMENTATION_LIB_DIR}/libGPUInstrumentationTestLib.so \
           python3 -m pytest --capture=tee-sys -rfs -vvv instrumentation/test_gpuhello.py
     - name: Run interpreter tests

.github/workflows/integration-tests.yml.in

Lines changed: 3 additions & 3 deletions
@@ -300,13 +300,13 @@ jobs:
           echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
         fi
         cd python/test/unit
-        python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
+        python3 -m pytest -s -n 8 --ignore=cuda/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
         python3 -m pytest -s -n 8 language/test_subprocess.py
         python3 -m pytest -s -n 8 test_debug.py --forked
         # Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
         TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s language/test_line_info.py
-        # Run hopper/test_flashattention.py separately to avoid out of gpu memory
-        python3 -m pytest -s hopper/test_flashattention.py
+        # Run cuda/test_flashattention.py separately to avoid out of gpu memory
+        python3 -m pytest -s cuda/test_flashattention.py
         TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${INSTRUMENTATION_LIB_DIR}/libGPUInstrumentationTestLib.so \
           python3 -m pytest --capture=tee-sys -rfs -vvv instrumentation/test_gpuhello.py
     - name: Run interpreter tests

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ cmake-build-*
 cuobjdump
 nvdisasm
 ptxas
+ptxas-blackwell

 # Third-party include
 third_party/nvidia/backend/include

bin/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -13,6 +13,7 @@ target_link_libraries(triton-opt PRIVATE
   ${triton_libs}
   # tests
   TritonTestAnalysis
+  TritonTestDialectTritonGPU
   # MLIR core
   MLIROptLib
   MLIRPass
@@ -31,6 +32,7 @@ target_link_libraries(triton-reduce PRIVATE
   ${triton_libs}
   # tests
   TritonTestAnalysis
+  TritonTestDialectTritonGPU
   # MLIR core
   MLIRReduceLib
   MLIRPass
@@ -48,6 +50,7 @@ target_link_libraries(triton-lsp PRIVATE
   ${triton_libs}
   # tests
   TritonTestAnalysis
+  TritonTestDialectTritonGPU
   # MLIR core
   MLIRLspServerLib
   MLIRPass
@@ -85,4 +88,5 @@ target_link_libraries(triton-tensor-layout PRIVATE
   ${conversion_libs}
   ${dialect_libs}
   TritonTestAnalysis
+  TritonTestDialectTritonGPU
 )

Lines changed: 6 additions & 5 deletions
@@ -1,8 +1,9 @@
 {
+  "ptxas-blackwell": "12.8.61",
   "ptxas": "12.4.99",
-  "cuobjdump": "12.4.99",
-  "nvdisasm": "12.4.99",
-  "cudacrt": "12.4.99",
-  "cudart": "12.4.99",
-  "cupti": "12.4.99"
+  "cuobjdump": "12.8.55",
+  "nvdisasm": "12.8.55",
+  "cudacrt": "12.8.61",
+  "cudart": "12.8.57",
+  "cupti": "12.8.57"
 }

include/triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h

Lines changed: 2 additions & 0 deletions
@@ -25,6 +25,8 @@ namespace triton {
 constexpr int patternBenefitDefault = 1;
 constexpr int patternBenefitPrioritizeOverLLVMConversions = 10;
 constexpr int patternBenefitClampOptimizedPattern = 20;
+constexpr int patternBenefitConvertLayoutOptimizedPattern = 20;
+constexpr int patternBenefitNvidiaTensorCoreSubviewPattern = 20;

 struct BackendCallbacks {
   /**
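
These constants plug into MLIR's pattern-benefit mechanism: when several rewrite patterns match the same op, the one registered with the higher benefit is tried first, so an optimized lowering can outrank the generic one. A minimal C++ sketch of how such a constant is typically consumed when registering a pattern; the pattern class, target op, and rewrite body below are illustrative, not from this commit:

#include "mlir/IR/PatternMatch.h"
#include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"

// Hypothetical optimized pattern: registered with benefit 20 so it is
// attempted before any default-benefit (1) pattern on the same op.
struct ConvertLayoutOptimizedPattern
    : public mlir::OpRewritePattern<mlir::triton::gpu::ConvertLayoutOp> {
  ConvertLayoutOptimizedPattern(mlir::MLIRContext *ctx)
      : mlir::OpRewritePattern<mlir::triton::gpu::ConvertLayoutOp>(
            ctx, mlir::triton::patternBenefitConvertLayoutOptimizedPattern) {}

  mlir::LogicalResult
  matchAndRewrite(mlir::triton::gpu::ConvertLayoutOp op,
                  mlir::PatternRewriter &rewriter) const override {
    // Emit the optimized lowering here, or return failure() so the
    // lower-benefit generic pattern handles the op instead.
    return mlir::failure();
  }
};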

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 58 additions & 8 deletions
@@ -669,14 +669,14 @@ def TT_DotOp : TT_Op<"dot", [Pure,
 // DotScaled Op
 //
 def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
-                           AttrSizedOperandSegments,
-                           DotLike,
-                           TypesMatchWith<"result's type matches accumulator's type",
-                                          "d", "c", "$_self">]> {
+                  AttrSizedOperandSegments,
+                  DotLike,
+                  TypesMatchWith<"result's type matches accumulator's type",
+                                 "d", "c", "$_self">]> {
   let summary = "dot_scaled";

   let description = [{
-    $d = matrix_multiply(scale($lhs, $lhs_scale), scale($rhs, $rhs_scale)) + $c.
+    $d = matrix_multiply(scale($lhs, $lhs_scale), scale(rlhs, $rhs_scale)) + $c.
     Where scale(x, s) is a function that applies the scale per block following microscaling spec.
   }];

@@ -687,16 +687,15 @@ def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
     RankedTensorOf<[TT_Float,I8]>:$lhs,
     RankedTensorOf<[TT_Float,I8]>:$rhs,
     TT_FloatTensor:$c,
-    Optional<RankedTensorOf<[I8]>>:$lhs_scale,
-    Optional<RankedTensorOf<[I8]>>:$rhs_scale,
+    Optional<RankedTensorOf<[TT_Float, I8]>>:$lhs_scale,
+    Optional<RankedTensorOf<[TT_Float, I8]>>:$rhs_scale,
     TT_ScaleDotElemTypeAttr:$lhs_type,
     TT_ScaleDotElemTypeAttr:$rhs_type,
     BoolAttr:$fastMath
   );

   let results = (outs TT_FloatTensor:$d);

-  // Not sure why I need to fully specify the optional group, but otherwise it complains when loading the mlir file
   let assemblyFormat = [{
     $lhs (`scale` $lhs_scale^)? `,` $rhs (`scale` $rhs_scale^)? `,` $c `lhs` `=` $lhs_type `rhs` `=` $rhs_type attr-dict
     `:` type($lhs) (`,` type($lhs_scale)^)? `*` type($rhs) (`,` type($rhs_scale)^)? `->` type($d)
@@ -1297,6 +1296,57 @@ def TT_ExperimentalDescriptorStoreOp : TT_Op<"experimental_descriptor_store", [
   let hasVerifier = 1;
 }

+def TT_ExperimentalDescriptorGatherOp : TT_Op<"experimental_descriptor_gather", [MemoryEffects<[MemRead<GlobalMemory>]>]> {
+  let summary = "gather multiple rows from a descriptor into a single tensor";
+  let description = [{
+    The `tt.experimental_descriptor_gather` op will be lowered to NVIDIA TMA
+    load operations on targets that support it.
+
+    `desc_ptr` is a pointer to the TMA descriptor allocated in global memory.
+    The descriptor block must have 1 row and the indices must be a 1D tensor.
+    Accordingly, the result is a 2D tensor with multiple rows.
+
+    This is an escape hatch and is only there for testing/experimenting. This
+    op will be removed in the future.
+  }];
+
+  let arguments = (ins
+    TT_TensorDescType:$desc,
+    RankedTensorOf<[I32]>:$x_offsets,
+    I32:$y_offset
+  );
+  let results = (outs TT_Tensor:$result);
+
+  let assemblyFormat = [{
+    $desc `[` $x_offsets `,` $y_offset `]`
+    attr-dict `:` functional-type(operands, results)
+  }];
+
+  let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+    // TMA gathers have restrictions on the minimum size of the gather result.
+    // This function verifies the result type.
+    static LogicalResult verifyResultType(Operation *op, mlir::ShapedType type);
+  }];
+}
+
+def TT_ExperimentalDescriptorScatterOp : TT_Op<"experimental_descriptor_scatter", [
+  MemoryEffects<[MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>,
+]> {
+  let arguments = (ins
+    TT_TensorDescType:$desc,
+    RankedTensorOf<[I32]>:$x_offsets,
+    I32:$y_offset,
+    TT_Tensor:$src
+  );
+
+  let assemblyFormat = [{
+    $desc `[` $x_offsets `,` $y_offset `]` `,` $src
+    attr-dict `:` type(operands)
+  }];
+}
+
 def TT_ExperimentalTensormapCreateOp: TT_Op<
   "experimental_tensormap_create",
   [
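
Since the gather op is defined via ODS, tablegen emits default builders derived from its `arguments`/`results` lists. A hypothetical C++ sketch of constructing the op; the helper name and the assumed generated-builder signature are illustrative, not quoted from the commit:

#include "mlir/IR/Builders.h"
#include "triton/Dialect/Triton/IR/Dialect.h"

// Hypothetical helper: gather the rows selected by the 1D i32 tensor
// `xOffsets`, at column `yOffset`, from descriptor `desc` into a 2D tensor
// of type `resultTy`.
static mlir::Value emitDescriptorGather(mlir::OpBuilder &b, mlir::Location loc,
                                        mlir::Type resultTy, mlir::Value desc,
                                        mlir::Value xOffsets,
                                        mlir::Value yOffset) {
  return b.create<mlir::triton::ExperimentalDescriptorGatherOp>(
      loc, resultTy, desc, xOffsets, yOffset);
}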

include/triton/Dialect/Triton/IR/Utility.h

Lines changed: 6 additions & 0 deletions
@@ -167,6 +167,12 @@ template <typename VecT> bool isConsecutive(const VecT &vec) {
   return isConsecutive(ArrayRef(vec));
 }

+template <typename T> auto seq(T start, T end, T step) {
+  auto len = ceil<T>(end - start, step);
+  return llvm::map_range(llvm::seq<T>(0, len),
+                         [=](T i) { return start + i * step; });
+}
+
 } // namespace triton
 } // namespace mlir
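
The new `seq(start, end, step)` overload builds a lazy strided half-open range on top of `llvm::seq` and this header's `ceil` helper. A minimal usage sketch (the wrapper function is illustrative, not part of the commit): with `start=0`, `end=10`, `step=3`, the length is `ceil(10, 3) = 4`, so the range yields 0, 3, 6, 9.

#include "triton/Dialect/Triton/IR/Utility.h"
#include "llvm/Support/raw_ostream.h"

void printStridedRange() {
  // Iterate the half-open strided range [0, 10) with step 3.
  for (int i : mlir::triton::seq(0, 10, 3))
    llvm::outs() << i << ' '; // prints: 0 3 6 9
}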

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 11 additions & 0 deletions
@@ -9,6 +9,7 @@
 #include "triton/Tools/LinearLayout.h"

 namespace mlir::triton::gpu {
+class SharedEncodingAttr;

 // - BlockedEncodingAttrs have the following input dimensions.
 //
@@ -41,6 +42,16 @@ namespace mlir::triton::gpu {
 LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
                             std::optional<int32_t> elemBitWidth = std::nullopt);

+// Convert the shared encoding of a tensor with `hasLeadingOffset=true` to a
+// LinearLayout that maps from a linear shared memory offset to tensor index.
+//
+// If `disableSwizzle` is set, then the resulting layout does not include
+// swizzling.
+LinearLayout sharedToLinearLayoutLeadingOffset(ArrayRef<int64_t> shape,
+                                               SharedEncodingAttr shared,
+                                               int32_t elemBitWidth,
+                                               bool disableSwizzle = false);
+
 // Given a linear layout where the input dimensions contain a "block" dimension,
 // this method sets the "block" dimension to 0 and removes the corresponding
 // output dimensions.
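
A minimal call sketch for the new declaration, assuming `enc` is a `SharedEncodingAttr` with `hasLeadingOffset = true`; the wrapper function, shape, and bit width are illustrative, not from the commit:

#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"

// Build the smem-offset -> tensor-index layout for a 128x64 tensor of
// 16-bit elements; pass disableSwizzle=true to drop the swizzle component.
mlir::triton::LinearLayout
buildSmemLayout(mlir::triton::gpu::SharedEncodingAttr enc) {
  return mlir::triton::gpu::sharedToLinearLayoutLeadingOffset(
      /*shape=*/{128, 64}, /*shared=*/enc, /*elemBitWidth=*/16);
}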

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 18 additions & 0 deletions
@@ -23,6 +23,24 @@ def TritonGPUPipeline : Pass<"tritongpu-pipeline", "mlir::ModuleOp"> {
   ];
 }

+def TritonGPUTC05MMAPipeline : Pass<"tritongpu-tc05mma-pipeline", "mlir::ModuleOp"> {
+  let summary = "Test pass calling TC05MMA pipeline";
+
+  let description = [{
+    This pass is used to test the TC05MMA pipelining under LIT. Internally it calls
+    `getTC05MMASchedule` to get the schedule and then applies the pipelining.
+  }];
+
+  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
+                           "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
+                           "mlir::scf::SCFDialect",
+                           "mlir::arith::ArithDialect"];
+
+  let options = [
+    Option<"disableExpander", "disable-expander", "bool", /*default*/"false", "Run only loop pre-process">
+  ];
+}
+
 def TritonGPUTestPipelineAssignLatencies : Pass<"tritongpu-test-pipeline-assign-latencies", "mlir::ModuleOp"> {
   let summary = "test assigning latencies to interesting ops ahead of pipelining";
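
Given standard MLIR pass registration, a LIT test would presumably drive this pass as `triton-opt -tritongpu-tc05mma-pipeline input.mlir`, or `triton-opt -tritongpu-tc05mma-pipeline=disable-expander=true input.mlir` to stop after the loop pre-process step; the flag spellings are inferred from the pass argument and option declared above, not quoted from the commit.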
