Commit e94b789

Merge commit '6d3ed0b91116e1e238a56f8b1d0d7cdaa2141911'
2 parents 4fe1e6f + 6d3ed0b commit e94b789

149 files changed: +6780, -6787 lines

bin/triton-tensor-layout.cpp

Lines changed: 3 additions & 3 deletions
@@ -22,16 +22,16 @@ using namespace mlir;
 // clang-format off
 // Example usage:
 //
-// triton-tensor-layout -l "#triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 256, 32]}>" -t "tensor<128x256xf16>"
+// triton-tensor-layout -l "#ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 256, 32]}>" -t "tensor<128x256xf16>"
 //
 // triton-tensor-layout -i input.mlir -t "tensor<1x128x128xf16>" -o output.txt
 //
 // triton-tensor-layout -i input.mlir -t "tensor<1x128x128xf16>" -o output.txt -alias-names="blocked,mma" -use-hw-view
 //
 // An input file usually looks like:
 // '''
-// #mma = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 1, 8], instrShape = [32, 32], isTransposed = false}>
-// #blocked = #triton_gpu.blocked<{sizePerThread = [1, 8, 1], threadsPerWarp = [1, 16, 4], warpsPerCTA = [1, 1, 8], order = [0, 1, 2]}>
+// #mma = #ttg.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 1, 8], instrShape = [32, 32], isTransposed = false}>
+// #blocked = #ttg.blocked<{sizePerThread = [1, 8, 1], threadsPerWarp = [1, 16, 4], warpsPerCTA = [1, 1, 8], order = [0, 1, 2]}>
 // '''
 // clang-format on
 

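Judging by the usage comment above, the file passed via -i only needs to define the layout attribute aliases that -alias-names later selects. A minimal sketch of such a file, reusing the attribute definitions from the comment verbatim (the file name and the assumption that nothing else is required are mine, not something this commit states):

// input.mlir (hypothetical file name)
#mma = #ttg.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 1, 8], instrShape = [32, 32], isTransposed = false}>
#blocked = #ttg.blocked<{sizePerThread = [1, 8, 1], threadsPerWarp = [1, 16, 4], warpsPerCTA = [1, 1, 8], order = [0, 1, 2]}>

Fed to the last invocation in the comment, triton-tensor-layout would then presumably print the hardware view of both aliased layouts for the tensor type given via -t.
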
docs/ARCHITECTURE.md

Lines changed: 15 additions & 15 deletions
@@ -314,7 +314,7 @@ Example 1, a row-major coalesced layout may partition a 16x16 tensor over 2 warp
 ```
 for
 ```
-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
 sizePerThread = {2, 2}
 threadsPerWarp = {8, 4}
 warpsPerCTA = {1, 2}
@@ -341,7 +341,7 @@ Example 2, a row-major coalesced layout may partition a 32x32 tensor over 2 warp
 ```
 for
 ```
-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
 sizePerThread = {2, 2}
 threadsPerWarp = {8, 4}
 warpsPerCTA = {1, 2}
@@ -373,7 +373,7 @@ CTA [1,0] CTA [1,1]
 ```
 for
 ```
-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
 sizePerThread = {2, 2}
 threadsPerWarp = {8, 4}
 warpsPerCTA = {1, 2}
@@ -403,25 +403,25 @@ A single dot operator is likely to be mapped to multiple MMA instructions. For N
 ### Layout conversion
 To produce the desired memory behavior described in the previous section, triton GPU introduces layouts conversion (by means of ConvertLayoutOp). An input tensor represented in a blocked layout is sliced and inserted into a shared layout, e.g.:
 ```
-%61 = triton_gpu.insert_slice_async %39, %58, %c0_i32, %60, %cst_1 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x32x!tt.ptr<f16>, #blocked> -> tensor<4x64x32xf16, #shared>
-triton_gpu.async_commit_group
+%61 = ttg.insert_slice_async %39, %58, %c0_i32, %60, %cst_1 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x32x!tt.ptr<f16>, #blocked> -> tensor<4x64x32xf16, #shared>
+ttg.async_commit_group
 ```
 
 The main loop of the GEMM would then extract a slice (a reimplementation of tensor.extract_slice [25]) from the shared memory, converting arguments to the dot layout and producing mma layout with the dot operator, e.g.:
 <pre><code>
-%107:14 = <b>scf.for</b> %arg9 = %c0_i32 to %51 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %39, %arg12 = %49, %arg13 = %94, %arg14 = %100, %arg15 = %101, %arg16 = %102, %arg17 = %85, %arg18 = %86, %arg19 = %c2_i32, %arg20 = %c3_i32, %arg21 = %c1_i32, %arg22 = %104, %arg23 = %106) -> (tensor<64x128xf32, #mma>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, tensor<4x64x32xf16, #shared>, tensor<4x32x128xf16, #shared1>, tensor<64x32xf16, #shared>, tensor<32x128xf16, #shared1>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, i32, i32, i32, tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, tensor<16x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>) : i32 {
-%126 = <b>triton_gpu.extract_slice</b> %arg15[0, 16] [64, 16] [1, 1] : tensor<64x32xf16, #shared> to tensor<64x16xf16, #shared>
-%127 = <b>triton_gpu.convert_layout</b> %126 : (tensor<64x16xf16, <b>#shared</b>>) -> tensor<64x16xf16, <b>#triton_gpu.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>>
-%128 = <b>triton_gpu.extract_slice</b> %arg16[16, 0] [16, 128] [1, 1] : tensor<32x128xf16, <b>#shared1</b>> to tensor<16x128xf16, <b>#shared1</b>>
-%129 = <b>triton_gpu.convert_layout</b> %128 : (tensor<16x128xf16, #shared1>) -> tensor<16x128xf16, <b>#triton_gpu.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>>
-%130 = tt.dot %arg22, %arg23, %arg10 {allowTF32 = true} : tensor<64x16xf16, <b>#triton_gpu.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#triton_gpu.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
-%131 = <b>tt.dot</b> %127, %129, %130 {allowTF32 = true} : tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#triton_gpu.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
+%107:14 = <b>scf.for</b> %arg9 = %c0_i32 to %51 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %39, %arg12 = %49, %arg13 = %94, %arg14 = %100, %arg15 = %101, %arg16 = %102, %arg17 = %85, %arg18 = %86, %arg19 = %c2_i32, %arg20 = %c3_i32, %arg21 = %c1_i32, %arg22 = %104, %arg23 = %106) -> (tensor<64x128xf32, #mma>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, tensor<4x64x32xf16, #shared>, tensor<4x32x128xf16, #shared1>, tensor<64x32xf16, #shared>, tensor<32x128xf16, #shared1>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, i32, i32, i32, tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, tensor<16x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>) : i32 {
+%126 = <b>ttg.extract_slice</b> %arg15[0, 16] [64, 16] [1, 1] : tensor<64x32xf16, #shared> to tensor<64x16xf16, #shared>
+%127 = <b>ttg.convert_layout</b> %126 : (tensor<64x16xf16, <b>#shared</b>>) -> tensor<64x16xf16, <b>#ttg.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>>
+%128 = <b>ttg.extract_slice</b> %arg16[16, 0] [16, 128] [1, 1] : tensor<32x128xf16, <b>#shared1</b>> to tensor<16x128xf16, <b>#shared1</b>>
+%129 = <b>ttg.convert_layout</b> %128 : (tensor<16x128xf16, #shared1>) -> tensor<16x128xf16, <b>#ttg.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>>
+%130 = tt.dot %arg22, %arg23, %arg10 {allowTF32 = true} : tensor<64x16xf16, <b>#ttg.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#ttg.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
+%131 = <b>tt.dot</b> %127, %129, %130 {allowTF32 = true} : tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#ttg.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
 ...
 </code></pre>
 
 The result of the processing is then converted back to blocked layout to be stored to the main GPU memory, e.g.:
 ```
-%125 = triton_gpu.convert_layout %108 : (tensor<64x128xf16, #mma>) -> tensor<64x128xf16, #blocked1>
+%125 = ttg.convert_layout %108 : (tensor<64x128xf16, #mma>) -> tensor<64x128xf16, #blocked1>
 tt.store %117, %125, %124 {cache = 1 : i32, evict = 1 : i32} : tensor<64x128xf16, #blocked1>
 ```
 
@@ -454,12 +454,12 @@ Is translated to:
 ```
 %a: tensor<128x32xf16, #enc>
 %a_tmp = tensor.extract_slice %a[0, 0] [128, 16]
-%a_prefetch = triton_gpu.convert_layout %a_tmp
+%a_prefetch = ttg.convert_layout %a_tmp
 scf.for %iv = ... iter_args(%a_buf = %a, ..., %a_prefetch_arg = %a_prefetch)
 {
 %x = tt.dot %a_arg, %b, %c
 %a_tmp_rem = tensor.extract_slice %a_buf[0, 16] [128, 16]
-%a_prefetch_next = triton_gpu.convert_layout %a_tmp_rem
+%a_prefetch_next = ttg.convert_layout %a_tmp_rem
 ...
 scf.yield %next_a, ..., %a_prefetch_next
 }

docs/getting-started/architecture.rst

Lines changed: 15 additions & 15 deletions
@@ -355,7 +355,7 @@ for
 
 .. code-block:: none
 
-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
 sizePerThread = {2, 2}
 threadsPerWarp = {8, 4}
 warpsPerCTA = {1, 2}
@@ -385,7 +385,7 @@ for
 
 .. code-block:: none
 
-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
 sizePerThread = {2, 2}
 threadsPerWarp = {8, 4}
 warpsPerCTA = {1, 2}
@@ -420,7 +420,7 @@ for
 
 .. code-block:: none
 
-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
 sizePerThread = {2, 2}
 threadsPerWarp = {8, 4}
 warpsPerCTA = {1, 2}
@@ -458,30 +458,30 @@ To produce the desired memory behavior described in the previous section, Triton
 
 .. code-block:: none
 
-%61 = triton_gpu.insert_slice_async %39, %58, %c0_i32, %60, %cst_1 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x32x!tt.ptr<f16>, #blocked> -> tensor<4x64x32xf16, #shared>
-triton_gpu.async_commit_group
+%61 = ttg.insert_slice_async %39, %58, %c0_i32, %60, %cst_1 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x32x!tt.ptr<f16>, #blocked> -> tensor<4x64x32xf16, #shared>
+ttg.async_commit_group
 
 
 The main loop of the GEMM would then extract a slice (a reimplementation of tensor.extract_slice [c25]_) from the shared memory, converting arguments to the dot layout and producing mma layout with the dot operator, for example:
 
 .. raw:: html
 
 <div class="highlight-none notranslate"><div class="highlight"><pre><span></span>
-%107:14 = <b>scf.for</b> %arg9 = %c0_i32 to %51 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %39, %arg12 = %49, %arg13 = %94, %arg14 = %100, %arg15 = %101, %arg16 = %102, %arg17 = %85, %arg18 = %86, %arg19 = %c2_i32, %arg20 = %c3_i32, %arg21 = %c1_i32, %arg22 = %104, %arg23 = %106) -> (tensor<64x128xf32, #mma>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, tensor<4x64x32xf16, #shared>, tensor<4x32x128xf16, #shared1>, tensor<64x32xf16, #shared>, tensor<32x128xf16, #shared1>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, i32, i32, i32, tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, tensor<16x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>) : i32 {
-%126 = <b>triton_gpu.extract_slice</b> %arg15[0, 16] [64, 16] [1, 1] : tensor<64x32xf16, #shared> to tensor<64x16xf16, #shared>
-%127 = <b>triton_gpu.convert_layout</b> %126 : (tensor<64x16xf16, <b>#shared</b>>) -> tensor<64x16xf16, <b>#triton_gpu.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>>
-%128 = <b>triton_gpu.extract_slice</b> %arg16[16, 0] [16, 128] [1, 1] : tensor<32x128xf16, <b>#shared1</b>> to tensor<16x128xf16, <b>#shared1</b>>
-%129 = <b>triton_gpu.convert_layout</b> %128 : (tensor<16x128xf16, #shared1>) -> tensor<16x128xf16, <b>#triton_gpu.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>>
-%130 = tt.dot %arg22, %arg23, %arg10 {allowTF32 = true} : tensor<64x16xf16, <b>#triton_gpu.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#triton_gpu.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
-%131 = <b>tt.dot</b> %127, %129, %130 {allowTF32 = true} : tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#triton_gpu.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
+%107:14 = <b>scf.for</b> %arg9 = %c0_i32 to %51 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %39, %arg12 = %49, %arg13 = %94, %arg14 = %100, %arg15 = %101, %arg16 = %102, %arg17 = %85, %arg18 = %86, %arg19 = %c2_i32, %arg20 = %c3_i32, %arg21 = %c1_i32, %arg22 = %104, %arg23 = %106) -> (tensor<64x128xf32, #mma>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, tensor<4x64x32xf16, #shared>, tensor<4x32x128xf16, #shared1>, tensor<64x32xf16, #shared>, tensor<32x128xf16, #shared1>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x128x!tt.ptr<f16>, #blocked1>, i32, i32, i32, tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, tensor<16x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>) : i32 {
+%126 = <b>ttg.extract_slice</b> %arg15[0, 16] [64, 16] [1, 1] : tensor<64x32xf16, #shared> to tensor<64x16xf16, #shared>
+%127 = <b>ttg.convert_layout</b> %126 : (tensor<64x16xf16, <b>#shared</b>>) -> tensor<64x16xf16, <b>#ttg.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>>
+%128 = <b>ttg.extract_slice</b> %arg16[16, 0] [16, 128] [1, 1] : tensor<32x128xf16, <b>#shared1</b>> to tensor<16x128xf16, <b>#shared1</b>>
+%129 = <b>ttg.convert_layout</b> %128 : (tensor<16x128xf16, #shared1>) -> tensor<16x128xf16, <b>#ttg.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>>
+%130 = tt.dot %arg22, %arg23, %arg10 {allowTF32 = true} : tensor<64x16xf16, <b>#ttg.dot_op</b><{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#ttg.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
+%131 = <b>tt.dot</b> %127, %129, %130 {allowTF32 = true} : tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x128xf16, <b>#ttg.dot_op</b><{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<64x128xf32, <b>#mma</b>>
 ...
 </pre></div></div>
 
 The result of the processing is then converted back to blocked layout to be stored to the main GPU memory, for example:
 
 .. code-block:: none
 
-%125 = triton_gpu.convert_layout %108 : (tensor<64x128xf16, #mma>) -> tensor<64x128xf16, #blocked1>
+%125 = ttg.convert_layout %108 : (tensor<64x128xf16, #mma>) -> tensor<64x128xf16, #blocked1>
 tt.store %117, %125, %124 {cache = 1 : i32, evict = 1 : i32} : tensor<64x128xf16, #blocked1>
 
 
@@ -520,12 +520,12 @@ Is translated to:
 
 %a: tensor<128x32xf16, #enc>
 %a_tmp = tensor.extract_slice %a[0, 0] [128, 16]
-%a_prefetch = triton_gpu.convert_layout %a_tmp
+%a_prefetch = ttg.convert_layout %a_tmp
 scf.for %iv = ... iter_args(%a_buf = %a, ..., %a_prefetch_arg = %a_prefetch)
 {
 %x = tt.dot %a_arg, %b, %c
 %a_tmp_rem = tensor.extract_slice %a_buf[0, 16] [128, 16]
-%a_prefetch_next = triton_gpu.convert_layout %a_tmp_rem
+%a_prefetch_next = ttg.convert_layout %a_tmp_rem
 ...
 scf.yield %next_a, ..., %a_prefetch_next
 }

include/triton/Analysis/Allocation.h

Lines changed: 2 additions & 2 deletions
@@ -180,8 +180,8 @@ class Allocation {
 private:
 /// A class that represents a shared memory buffer
 struct BufferT {
-/// Explicit: triton_gpu.local_alloc
-/// Scratch: triton_gpu.convert_layout
+/// Explicit: ttg.local_alloc
+/// Scratch: ttg.convert_layout
 /// Virtual: triton.call
 enum class BufferKind { Explicit, Scratch, Virtual };
 

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 1 addition & 1 deletion
@@ -400,7 +400,7 @@ inline Value getGlobalScratchPtr(Location loc, RewriterBase &rewriter,
 
 ModuleOp mod = funcOp.getOperation()->getParentOfType<ModuleOp>();
 auto allocSizeAttr = mod.getOperation()->getAttrOfType<mlir::IntegerAttr>(
-"triton_gpu.global_scratch_memory_size");
+"ttg.global_scratch_memory_size");
 if (!allocSizeAttr) {
 return gmemBase;
 }

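The function above reads the renamed ttg.global_scratch_memory_size attribute from the enclosing module and falls back to gmemBase when it is absent. A minimal sketch of a module that would take the non-fallback path, with a purely illustrative size value that does not come from this commit:

// Hypothetical module carrying the renamed scratch-size attribute; 1024 is an illustrative value.
module attributes {ttg.global_scratch_memory_size = 1024 : i32} {
  // ... kernels that use the global scratch buffer ...
}
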
include/triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h

Lines changed: 4 additions & 4 deletions
@@ -12,11 +12,11 @@ template <typename T> class OperationPass;
 
 namespace triton {
 
-constexpr static char AttrNumWarpsName[] = "triton_gpu.num-warps";
-constexpr static char AttrNumCTAsName[] = "triton_gpu.num-ctas";
-constexpr static char AttrTargetName[] = "triton_gpu.target";
+constexpr static char AttrNumWarpsName[] = "ttg.num-warps";
+constexpr static char AttrNumCTAsName[] = "ttg.num-ctas";
+constexpr static char AttrTargetName[] = "ttg.target";
 
-constexpr static char AttrNumThreadsPerWarp[] = "triton_gpu.threads-per-warp";
+constexpr static char AttrNumThreadsPerWarp[] = "ttg.threads-per-warp";
 
 // Create the pass with numWarps passed from cl::opt.
 std::unique_ptr<OperationPass<ModuleOp>> createConvertTritonToTritonGPUPass();

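These constants name the module-level attributes that the Triton-to-TritonGPU conversion attaches, so after this commit a converted module is tagged with the ttg.* spelling. A hedged sketch of such a module header, with purely illustrative values and target string (none of these appear in this commit):

// Sketch only: the attribute values and the target string are assumptions.
module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32,
                   "ttg.threads-per-warp" = 32 : i32, "ttg.target" = "cuda:90"} {
  // ... converted kernel functions ...
}
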
include/triton/Dialect/TritonGPU/IR/CMakeLists.txt

Lines changed: 4 additions & 4 deletions
@@ -1,12 +1,12 @@
 set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
 
 set(LLVM_TARGET_DEFINITIONS TritonGPUOps.td)
-mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=triton_gpu)
-mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=triton_gpu)
+mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=ttg)
+mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=ttg)
 mlir_tablegen(Ops.h.inc -gen-op-decls)
 mlir_tablegen(Ops.cpp.inc -gen-op-defs)
-mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=triton_gpu)
-mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=triton_gpu)
+mlir_tablegen(Types.h.inc -gen-typedef-decls -typedefs-dialect=ttg)
+mlir_tablegen(Types.cpp.inc -gen-typedef-defs -typedefs-dialect=ttg)
 add_mlir_doc(TritonGPUDialect TritonGPUDialect dialects/ -gen-dialect-doc)
 add_mlir_doc(TritonGPUOps TritonGPUOps dialects/ -gen-op-doc)
 add_public_tablegen_target(TritonGPUTableGen)

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 3 additions & 3 deletions
@@ -616,7 +616,7 @@ Example 1, a row-major coalesced layout may partition a 16x16 tensor over 2 warp
 
 for
 
-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
 sizePerThread = {2, 2}
 threadsPerWarp = {8, 4}
 warpsPerCTA = {1, 2}
@@ -642,7 +642,7 @@ Example 2, a row-major coalesced layout may partition a 32x32 tensor over 2 warp
 [ 28 28 29 29 30 30 31 31 ; 60 60 61 61 62 62 63 63 28 28 29 29 30 30 31 31 ; 60 60 61 61 62 62 63 63 ]
 for
 
-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
 sizePerThread = {2, 2}
 threadsPerWarp = {8, 4}
 warpsPerCTA = {1, 2}
@@ -672,7 +672,7 @@ CTA [1,0] CTA [1,1]
 [ 28 28 29 29 30 30 31 31 ; 60 60 61 61 62 62 63 63 ] [ 28 28 29 29 30 30 31 31 ; 60 60 61 61 62 62 63 63 ]
 for
 
-#triton_gpu.blocked_layout<{
+#ttg.blocked_layout<{
 sizePerThread = {2, 2}
 threadsPerWarp = {8, 4}
 warpsPerCTA = {1, 2}

include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td

Lines changed: 9 additions & 9 deletions
@@ -4,7 +4,7 @@
 include "mlir/IR/OpBase.td"
 
 def TritonGPU_Dialect : Dialect {
-let name = "triton_gpu";
+let name = "ttg";
 
 let cppNamespace = "::mlir::triton::gpu";
 
@@ -21,24 +21,24 @@ def TritonGPU_Dialect : Dialect {
 ];
 
 let extraClassDeclaration = [{
-static std::string getNumWarpsAttrName() { return "triton_gpu.num-warps"; }
+static std::string getNumWarpsAttrName() { return "ttg.num-warps"; }
 static int getNumWarps(ModuleOp mod) {
-if (!mod->hasAttr("triton_gpu.num-warps"))
+if (!mod->hasAttr("ttg.num-warps"))
 llvm::report_fatal_error(
-"TritonGPU module should contain a triton_gpu.num-warps attribute");
-return cast<IntegerAttr>(mod->getAttr("triton_gpu.num-warps")).getInt();
+"TritonGPU module should contain a ttg.num-warps attribute");
+return cast<IntegerAttr>(mod->getAttr("ttg.num-warps")).getInt();
 }
 static int getNumCTAs(ModuleOp mod) {
-if (!mod->hasAttr("triton_gpu.num-ctas"))
+if (!mod->hasAttr("ttg.num-ctas"))
 return 1;
-return cast<IntegerAttr>(mod->getAttr("triton_gpu.num-ctas")).getInt();
+return cast<IntegerAttr>(mod->getAttr("ttg.num-ctas")).getInt();
 }
 void registerTypes();
 
-static std::string getThreadsPerWarpAttrName() { return "triton_gpu.threads-per-warp"; }
+static std::string getThreadsPerWarpAttrName() { return "ttg.threads-per-warp"; }
 
 static int getThreadsPerWarp(ModuleOp mod) {
-Attribute threadsPerWarp = mod->getDiscardableAttr("triton_gpu.threads-per-warp");
+Attribute threadsPerWarp = mod->getDiscardableAttr("ttg.threads-per-warp");
 if(!threadsPerWarp) {
 return 32;
 }

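The accessors above also encode the defaults: ttg.num-warps is mandatory (its absence is a fatal error), while getNumCTAs and getThreadsPerWarp fall back to 1 and 32 when their attributes are missing. A module relying on those defaults therefore only needs the warp count, as in this sketch (the value 8 is illustrative, not from the commit):

// Minimal module: getNumCTAs(mod) returns 1 and getThreadsPerWarp(mod) returns 32 by default.
module attributes {"ttg.num-warps" = 8 : i32} {
}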