Skip to content

Commit 33869db

Browse files
authored
[NFC] Split Membar tests into common and ttng specific files (#6637)
Splitting the tests lets us reuse the common tests when a custom filter function is added to the `Membar` analysis in the `amdgpu` backend in a follow-up PR.
1 parent 76045c8 commit 33869db

File tree

2 files changed

+144
-140
lines changed

2 files changed

+144
-140
lines changed
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
// RUN: triton-opt %s -split-input-file --convert-scf-to-cf --allocate-shared-memory -test-print-membar | FileCheck %s --check-prefixes=CHECK,CF
2+
// RUN: triton-opt %s -split-input-file --allocate-shared-memory -test-print-membar | FileCheck %s --check-prefixes=CHECK,SCF
3+
4+
#AL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
5+
#A_SHARED = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
6+
7+
// Test: an async TMA store wait followed by a local store into a buffer
// obtained from ttg.local_alloc in shared memory. The FileCheck directives
// below expect a gpu.barrier on the line directly after async_tma_store_wait
// and directly before ttg.local_store.
module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32} {
8+
// CHECK-LABEL: @async_store_wait
9+
tt.func @async_store_wait(%arg: tensor<32x16xf16, #AL>) {
10+
%alloc = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
11+
// CHECK: async_tma_store_wait
12+
ttng.async_tma_store_wait {pendings = 0 : i32}
13+
// CHECK-NEXT: gpu.barrier
14+
// CHECK-NEXT: ttg.local_store
15+
ttg.local_store %arg, %alloc : tensor<32x16xf16, #AL> -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
16+
tt.return
17+
}
18+
}
19+
20+
// -----
21+
22+
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
23+
#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
24+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
25+
26+
// Test: straight-line sequences of mbarrier and TMA ops that share one
// barrier allocation (%barrier) and one data allocation (%alloc). The
// FileCheck directives pin exactly where gpu.barrier appears in the output:
//   * two consecutive init_barrier ops: nothing between them;
//   * barrier_expect -> async_tma_copy_global_to_local -> wait_barrier:
//     one gpu.barrier in front of barrier_expect, none inside the sequence;
//   * async_tma_copy_global_to_local -> barrier_expect -> wait_barrier:
//     gpu.barrier between barrier_expect and wait_barrier;
//   * local_load of %alloc: follows wait_barrier directly;
//   * barrier_expect -> async_tma_copy_global_to_local -> wait_barrier after
//     the local_load: gpu.barrier between barrier_expect and the copy;
//   * memdesc_subview -> barrier_expect -> async_tma_gather -> wait_barrier:
//     gpu.barrier between the gather and wait_barrier;
//   * two consecutive inval_barrier ops: one gpu.barrier in front of the
//     first, nothing between them.
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 18944 : i32} {
27+
// CHECK-LABEL: tma_special_cases
28+
tt.func @tma_special_cases(%arg1: !tt.ptr<i8, 0>) -> (tensor<256x64xf16, #blocked>){
29+
%true = arith.constant 1 : i1
30+
%cx = arith.constant dense<1> : tensor<32xi32>
31+
%c0 = arith.constant 0 : i32
32+
%barrier = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
33+
%alloc = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>
34+
// CHECK: ttng.init_barrier
35+
// CHECK-NEXT: ttng.init_barrier
36+
ttng.init_barrier %barrier, 1 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
37+
ttng.init_barrier %barrier, 1 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
38+
39+
// CHECK-NEXT: gpu.barrier
40+
// CHECK-NEXT: ttng.barrier_expect
41+
// CHECK-NEXT: ttng.async_tma_copy_global_to_local
42+
// CHECK-NEXT: ttng.wait_barrier
43+
ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
44+
ttng.async_tma_copy_global_to_local %arg1[%c0, %c0] %alloc, %barrier, %true : !tt.ptr<i8, 0>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>
45+
ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
46+
47+
// CHECK-NEXT: ttng.async_tma_copy_global_to_local
48+
// CHECK-NEXT: ttng.barrier_expect
49+
// CHECK-NEXT: gpu.barrier
50+
// CHECK-NEXT: ttng.wait_barrier
51+
ttng.async_tma_copy_global_to_local %arg1[%c0, %c0] %alloc, %barrier, %true : !tt.ptr<i8, 0>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>
52+
ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
53+
ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
54+
55+
// CHECK-NEXT: ttg.local_load
56+
%t = ttg.local_load %alloc : !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> -> tensor<256x64xf16, #blocked>
57+
58+
// CHECK-NEXT: ttng.barrier_expect
59+
// CHECK-NEXT: gpu.barrier
60+
// CHECK-NEXT: ttng.async_tma_copy_global_to_local
61+
// CHECK-NEXT: ttng.wait_barrier
62+
ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
63+
ttng.async_tma_copy_global_to_local %arg1[%c0, %c0] %alloc, %barrier, %true : !tt.ptr<i8, 0>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>
64+
ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
65+
66+
// CHECK-NEXT: memdesc_subview
67+
// CHECK-NEXT: ttng.barrier_expect
68+
// CHECK-NEXT: ttng.async_tma_gather
69+
// CHECK-NEXT: gpu.barrier
70+
// CHECK-NEXT: ttng.wait_barrier
71+
%view = ttg.memdesc_subview %alloc[%c0, %c0] : !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> -> !ttg.memdesc<32x64xf16, #shared, #ttg.shared_memory, mutable>
72+
ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
73+
ttng.async_tma_gather %arg1[%cx, %c0] %view, %barrier, %true : !tt.ptr<i8, 0>, tensor<32xi32>, i32, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>, !ttg.memdesc<32x64xf16, #shared, #ttg.shared_memory, mutable>, i1
74+
ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
75+
76+
// CHECK-NEXT: gpu.barrier
77+
// CHECK-NEXT: ttng.inval_barrier
78+
// CHECK-NEXT: ttng.inval_barrier
79+
ttng.inval_barrier %barrier : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
80+
ttng.inval_barrier %barrier : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
81+
82+
tt.return %t : tensor<256x64xf16, #blocked>
83+
}
84+
}
85+
86+
// -----
87+
88+
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
89+
#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
90+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
91+
92+
// Test: the same TMA ops placed inside a two-armed scf.if, followed by a
// local_load of %alloc after the branches rejoin. The file is checked twice:
// once with the conditional lowered by --convert-scf-to-cf (expects
// cf.cond_br / cf.br) and once with scf.if kept as-is (expects scf.if /
// "} else {" / "}").
// Expected barrier placement, identical in both runs:
//   * then-arm (async_tma_copy_global_to_local -> barrier_expect ->
//     wait_barrier): no gpu.barrier before or between the three ops;
//   * else-arm (local_store into %alloc): no gpu.barrier before the store,
//     and the arm ends on the very next line;
//   * after the conditional: a gpu.barrier directly in front of local_load.
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 18944 : i32} {
93+
// CHECK-LABEL: tma_special_cases_cf
94+
tt.func @tma_special_cases_cf(%arg1: !tt.ptr<i8, 0>, %i1 : i1, %arg2: tensor<256x64xf16, #blocked>) -> (tensor<256x64xf16, #blocked>){
95+
%true = arith.constant 1 : i1
96+
%c0 = arith.constant 0 : i32
97+
%barrier = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
98+
%alloc = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>
99+
// CF: cf.cond_br
100+
// SCF: scf.if
101+
scf.if %i1 {
102+
// CHECK-NOT: gpu.barrier
103+
// CHECK: ttng.async_tma_copy_global_to_local
104+
// CHECK-NEXT: ttng.barrier_expect
105+
// CHECK-NEXT: ttng.wait_barrier
106+
// CF-NEXT: cf.br
107+
// SCF-NEXT: } else {
108+
ttng.async_tma_copy_global_to_local %arg1[%c0, %c0] %alloc, %barrier, %true : !tt.ptr<i8, 0>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>
109+
ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
110+
ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
111+
} else {
112+
// CHECK-NOT: gpu.barrier
113+
// CHECK: ttg.local_store
114+
// CF-NEXT: cf.br
115+
// SCF-NEXT: }
116+
ttg.local_store %arg2, %alloc : tensor<256x64xf16, #blocked> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>
117+
}
118+
// CHECK: gpu.barrier
119+
// CHECK-NEXT: ttg.local_load
120+
%t = ttg.local_load %alloc : !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> -> tensor<256x64xf16, #blocked>
121+
tt.return %t : tensor<256x64xf16, #blocked>
122+
}
123+
}
124+
125+
// -----
126+
127+
// CHECK-LABEL: tmem_copy_after_alloc
128+
#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
129+
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
130+
#smem = #ttg.shared_memory
131+
#tmem_scales = #ttng.tensor_memory_scales_encoding<>
132+
// Test: a ttg.local_alloc initialised from a tensor argument, then a
// ttng.tmem_alloc, then a ttng.tmem_copy that takes both allocations as
// operands. The active FileCheck directives only verify that local_alloc,
// tmem_alloc and tmem_copy appear in that order.
module attributes {"ttg.num-warps" = 4 : i32} {
133+
tt.func @tmem_copy_after_alloc(%arg0: tensor<1x2048xf8E4M3FN, #blocked>) {
134+
// CHECK: local_alloc
135+
%0 = ttg.local_alloc %arg0 {allocation.offset = 53248 : i32} : (tensor<1x2048xf8E4M3FN, #blocked>) -> !ttg.memdesc<1x2048xf8E4M3FN, #shared, #smem>
136+
// CHECK: tmem_alloc
137+
%1 = ttng.tmem_alloc {tensor_memory_col_offset = 256 : i32, tensor_memory_row_offset = 0 : i32} : () -> !ttg.memdesc<128x16xf8E4M3FN, #tmem_scales, #ttng.tensor_memory, mutable>
138+
// NOTE(review): the next line is a plain comment with no FileCheck prefix,
// so the presence of a gpu.barrier between tmem_alloc and tmem_copy is not
// verified by this test - confirm whether a directive was intended here.
// gpu.barrier
139+
// CHECK: tmem_copy
140+
ttng.tmem_copy %0, %1, : (!ttg.memdesc<1x2048xf8E4M3FN, #shared, #smem>, !ttg.memdesc<128x16xf8E4M3FN, #tmem_scales, #ttng.tensor_memory, mutable>) -> ()
141+
tt.return
142+
}
143+
}

test/Analysis/test-membar.mlir

Lines changed: 1 addition & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,4 @@
1-
// RUN: triton-opt %s -split-input-file --convert-scf-to-cf --allocate-shared-memory -test-print-membar | FileCheck %s --check-prefix=CHECK --check-prefix=CF
2-
// RUN: triton-opt %s -split-input-file --allocate-shared-memory -test-print-membar | FileCheck %s --check-prefix=CHECK --check-prefix=SCF
3-
// RUN: triton-opt %s -split-input-file --convert-scf-to-cf --allocate-shared-memory -test-print-membar | FileCheck %s --check-prefix=CHECK --check-prefix=CF
4-
// RUN: triton-opt %s -split-input-file --allocate-shared-memory -test-print-membar | FileCheck %s --check-prefix=CHECK --check-prefix=SCF
1+
// RUN: triton-opt %s -split-input-file --allocate-shared-memory -test-print-membar | FileCheck %s
52

63
#AL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
74
#sliceAd0 = #ttg.slice<{dim = 0, parent = #AL}>
@@ -117,17 +114,6 @@ tt.func @async_wait(%arg: tensor<32x16xf16, #AL>) {
117114
tt.return
118115
}
119116

120-
// CHECK-LABEL: @async_store_wait
121-
tt.func @async_store_wait(%arg: tensor<32x16xf16, #AL>) {
122-
%alloc = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
123-
// CHECK: async_tma_store_wait
124-
ttng.async_tma_store_wait {pendings = 0 : i32}
125-
// CHECK-NEXT: gpu.barrier
126-
// CHECK-NEXT: ttg.local_store
127-
ttg.local_store %arg, %alloc : tensor<32x16xf16, #AL> -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable>
128-
tt.return
129-
}
130-
131117
// CHECK-LABEL: subview
132118
tt.func @subview() {
133119
%cst0 = arith.constant dense<0.000000e+00> : tensor<32x16xf16, #AL>
@@ -743,111 +729,6 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
743729

744730
// -----
745731

746-
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
747-
#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
748-
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
749-
750-
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 18944 : i32} {
751-
// CHECK-LABEL: tma_special_cases
752-
tt.func @tma_special_cases(%arg1: !tt.ptr<i8, 0>) -> (tensor<256x64xf16, #blocked>){
753-
%true = arith.constant 1 : i1
754-
%cx = arith.constant dense<1> : tensor<32xi32>
755-
%c0 = arith.constant 0 : i32
756-
%barrier = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
757-
%alloc = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>
758-
// CHECK: ttng.init_barrier
759-
// CHECK-NEXT: ttng.init_barrier
760-
ttng.init_barrier %barrier, 1 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
761-
ttng.init_barrier %barrier, 1 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
762-
763-
// CHECK-NEXT: gpu.barrier
764-
// CHECK-NEXT: ttng.barrier_expect
765-
// CHECK-NEXT: ttng.async_tma_copy_global_to_local
766-
// CHECK-NEXT: ttng.wait_barrier
767-
ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
768-
ttng.async_tma_copy_global_to_local %arg1[%c0, %c0] %alloc, %barrier, %true : !tt.ptr<i8, 0>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>
769-
ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
770-
771-
// CHECK-NEXT: ttng.async_tma_copy_global_to_local
772-
// CHECK-NEXT: ttng.barrier_expect
773-
// CHECK-NEXT: gpu.barrier
774-
// CHECK-NEXT: ttng.wait_barrier
775-
ttng.async_tma_copy_global_to_local %arg1[%c0, %c0] %alloc, %barrier, %true : !tt.ptr<i8, 0>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>
776-
ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
777-
ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
778-
779-
// CHECK-NEXT: ttg.local_load
780-
%t = ttg.local_load %alloc : !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> -> tensor<256x64xf16, #blocked>
781-
782-
// CHECK-NEXT: ttng.barrier_expect
783-
// CHECK-NEXT: gpu.barrier
784-
// CHECK-NEXT: ttng.async_tma_copy_global_to_local
785-
// CHECK-NEXT: ttng.wait_barrier
786-
ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
787-
ttng.async_tma_copy_global_to_local %arg1[%c0, %c0] %alloc, %barrier, %true : !tt.ptr<i8, 0>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>
788-
ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
789-
790-
// CHECK-NEXT: memdesc_subview
791-
// CHECK-NEXT: ttng.barrier_expect
792-
// CHECK-NEXT: ttng.async_tma_gather
793-
// CHECK-NEXT: gpu.barrier
794-
// CHECK-NEXT: ttng.wait_barrier
795-
%view = ttg.memdesc_subview %alloc[%c0, %c0] : !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> -> !ttg.memdesc<32x64xf16, #shared, #ttg.shared_memory, mutable>
796-
ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
797-
ttng.async_tma_gather %arg1[%cx, %c0] %view, %barrier, %true : !tt.ptr<i8, 0>, tensor<32xi32>, i32, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>, !ttg.memdesc<32x64xf16, #shared, #ttg.shared_memory, mutable>, i1
798-
ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
799-
800-
// CHECK-NEXT: gpu.barrier
801-
// CHECK-NEXT: ttng.inval_barrier
802-
// CHECK-NEXT: ttng.inval_barrier
803-
ttng.inval_barrier %barrier : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
804-
ttng.inval_barrier %barrier : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
805-
806-
tt.return %t : tensor<256x64xf16, #blocked>
807-
}
808-
}
809-
810-
// -----
811-
812-
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
813-
#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
814-
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
815-
816-
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 18944 : i32} {
817-
// CHECK-LABEL: tma_special_cases_cf
818-
tt.func @tma_special_cases_cf(%arg1: !tt.ptr<i8, 0>, %i1 : i1, %arg2: tensor<256x64xf16, #blocked>) -> (tensor<256x64xf16, #blocked>){
819-
%true = arith.constant 1 : i1
820-
%c0 = arith.constant 0 : i32
821-
%barrier = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
822-
%alloc = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>
823-
// CF: cf.cond_br
824-
// SCF: scf.if
825-
scf.if %i1 {
826-
// CHECK-NOT: gpu.barrier
827-
// CHECK: ttng.async_tma_copy_global_to_local
828-
// CHECK-NEXT: ttng.barrier_expect
829-
// CHECK-NEXT: ttng.wait_barrier
830-
// CF-NEXT: cf.br
831-
// SCF-NEXT: } else {
832-
ttng.async_tma_copy_global_to_local %arg1[%c0, %c0] %alloc, %barrier, %true : !tt.ptr<i8, 0>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>
833-
ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
834-
ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
835-
} else {
836-
// CHECK-NOT: gpu.barrier
837-
// CHECK: ttg.local_store
838-
// CF-NEXT: cf.br
839-
// SCF-NEXT: }
840-
ttg.local_store %arg2, %alloc : tensor<256x64xf16, #blocked> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>
841-
}
842-
// CHECK: gpu.barrier
843-
// CHECK-NEXT: ttg.local_load
844-
%t = ttg.local_load %alloc : !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> -> tensor<256x64xf16, #blocked>
845-
tt.return %t : tensor<256x64xf16, #blocked>
846-
}
847-
}
848-
849-
// -----
850-
851732
#layout = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
852733
#smem = #ttg.shared_memory
853734

@@ -983,26 +864,6 @@ tt.func @direct_backedge_within_loop(%arg0: index, %arg1: index, %arg2: index, %
983864

984865
// -----
985866

986-
// CHECK-LABEL: tmem_copy_after_alloc
987-
#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
988-
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
989-
#smem = #ttg.shared_memory
990-
#tmem_scales = #ttng.tensor_memory_scales_encoding<>
991-
module attributes {"ttg.num-warps" = 4 : i32} {
992-
tt.func @tmem_copy_after_alloc(%arg0: tensor<1x2048xf8E4M3FN, #blocked>) {
993-
// CHECK: local_alloc
994-
%0 = ttg.local_alloc %arg0 {allocation.offset = 53248 : i32} : (tensor<1x2048xf8E4M3FN, #blocked>) -> !ttg.memdesc<1x2048xf8E4M3FN, #shared, #smem>
995-
// CHECK: tmem_alloc
996-
%1 = ttng.tmem_alloc {tensor_memory_col_offset = 256 : i32, tensor_memory_row_offset = 0 : i32} : () -> !ttg.memdesc<128x16xf8E4M3FN, #tmem_scales, #ttng.tensor_memory, mutable>
997-
// gpu.barrier
998-
// CHECK: tmem_copy
999-
ttng.tmem_copy %0, %1, : (!ttg.memdesc<1x2048xf8E4M3FN, #shared, #smem>, !ttg.memdesc<128x16xf8E4M3FN, #tmem_scales, #ttng.tensor_memory, mutable>) -> ()
1000-
tt.return
1001-
}
1002-
}
1003-
1004-
// -----
1005-
1006867
#shared = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
1007868

1008869
module attributes {"ttg.num-warps" = 4 : i32, "ttg.target" = "cuda:80"} {

0 commit comments

Comments
 (0)