// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx950 --convert-builtin-func-to-llvm | FileCheck %s --check-prefixes=COMMON,GFX950
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx942 --convert-builtin-func-to-llvm | FileCheck %s --check-prefixes=COMMON,GFX942
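+ // Check that lowering attaches the "amdgpu.AsyncCopies" and "amdgpu.LocalLoads" alias scopes to async copies and local loads.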

- // COMMON: [[ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
- // COMMON: [[LOCAL_LOAD_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.LocalLoads"
+ // COMMON: [[$ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
+ // COMMON: [[$LOCAL_LOAD_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.LocalLoads"
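+ // The '$' prefix makes these FileCheck variables global, so the COMMON-LABEL directives added below do not clear them.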

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+ // COMMON-LABEL: @async_copy_alias
tt.func public @async_copy_alias(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
%arg1: !ttg.memdesc<64x1xf32, #shared, #smem, mutable>,
%maskVal: i1) {
@@ -15,9 +16,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
%ptr = tt.splat %arg0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked>
%mask = tt.splat %maskVal : i1 -> tensor<64x1xi1, #blocked>

- // COMMON: rocdl.global.load.lds {{.*}} {alias_scopes = [[[ASYNC_COPY_SCOPE]]]
+ // COMMON: rocdl.global.load.lds {{.*}} {alias_scopes = [[[$ASYNC_COPY_SCOPE]]]
// Check that store for 'other' has alias information set
- // COMMON: llvm.store {{.*}} {alias_scopes = [[[LOCAL_LOAD_SCOPE]]], noalias_scopes = [[[ASYNC_COPY_SCOPE]]]
+ // COMMON: llvm.store {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
%0 = ttg.async_copy_global_to_local %ptr, %arg1 mask %mask other %other : tensor<64x1x!tt.ptr<f32>, #blocked> -> <64x1xf32, #shared, #smem, mutable>

// COMMON: llvm.return
@@ -27,21 +28,22 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ

// -----

- // COMMON: [[ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
+ // COMMON: [[$ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+ // COMMON-LABEL: @buffer_load_to_local_alias
tt.func public @buffer_load_to_local_alias(%maskVal: i1,
%arg1: !tt.ptr<f32>,
%arg2: tensor<8x64xi32, #blocked>,
%arg3: !ttg.memdesc<8x64xf32, #shared, #smem, mutable>) {
%mask = tt.splat %maskVal : i1 -> tensor<8x64xi1, #blocked>
%other = arith.constant dense<1.000000e+00> : tensor<8x64xf32, #blocked>

- // COMMON: rocdl.raw.ptr.buffer.load.lds {{.*}} {alias_scopes = [[[ASYNC_COPY_SCOPE]]]
+ // COMMON: rocdl.raw.ptr.buffer.load.lds {{.*}} {alias_scopes = [[[$ASYNC_COPY_SCOPE]]]
// Check that store for 'other' has alias information set
- // COMMON: llvm.store {{.*}} {alias_scopes = [[[LOCAL_LOAD_SCOPE]]], noalias_scopes = [[[ASYNC_COPY_SCOPE]]]
+ // COMMON: llvm.store {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
%65 = amdgpu.buffer_load_to_local %arg1[%arg2] mask=%mask other=%other into %arg3 {OpIdx = #amdgpu.OpIdx<1>} : <f32>[tensor<8x64xi32, #blocked>] tensor<8x64xf32, #blocked> -> <8x64xf32, #shared, #smem, mutable>

// COMMON: llvm.return
@@ -51,14 +53,15 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar

// -----

- // COMMON: [[LOCAL_LOAD_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.LocalLoads"
- // COMMON: [[ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
+ // COMMON: [[$LOCAL_LOAD_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.LocalLoads"
+ // COMMON: [[$ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1]}>
#shared1 = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
#smem = #ttg.shared_memory
#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [32, 32], isTransposed = true}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+ // COMMON-LABEL: @local_loads_with_token_from_async_wait
tt.func public @local_loads_with_token_from_async_wait(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
%arg1: !ttg.memdesc<64x1xf16, #shared, #smem, mutable>,
%arg2: !ttg.memdesc<16x16xf16, #shared, #smem, mutable>) {
@@ -67,12 +70,12 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
// Check alias information is added for different lowering paths

// Test lowering path in common MemoryOpToLLVM pattern
- // COMMON: llvm.load {{.*}} {alias_scopes = [[[LOCAL_LOAD_SCOPE]]], {{.*}}, noalias_scopes = [[[ASYNC_COPY_SCOPE]]]
+ // COMMON: llvm.load {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], {{.*}}, noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
%4 = ttg.local_load %arg1 token %3 : !ttg.memdesc<64x1xf16, #shared, #smem, mutable> -> tensor<64x1xf16, #blocked>

// Test lowering path in AMD's MemoryOpToLLVM pattern
- // GFX942: llvm.load {{.*}} {alias_scopes = [[[LOCAL_LOAD_SCOPE]]], {{.*}}, noalias_scopes = [[[ASYNC_COPY_SCOPE]]]
- // GFX950: rocdl.ds.read.tr16.b64 {{.*}} {alias_scopes = [[[LOCAL_LOAD_SCOPE]]], noalias_scopes = [[[ASYNC_COPY_SCOPE]]]
+ // GFX942: llvm.load {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], {{.*}}, noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
+ // GFX950: rocdl.ds.read.tr16.b64 {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
%5 = ttg.local_load %arg2 token %3 : !ttg.memdesc<16x16xf16, #shared, #smem, mutable> -> tensor<16x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>

// Stores to keep the local_loads
@@ -90,27 +93,28 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ

// Same as above but LocalLoad does not use the token from AsyncWait

- // COMMON: [[ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
+ // COMMON: [[$ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1]}>
#shared1 = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
#smem = #ttg.shared_memory
#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [32, 32], isTransposed = true}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+ // COMMON-LABEL: @local_loads_without_token_from_async_wait
tt.func public @local_loads_without_token_from_async_wait(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
%arg1: !ttg.memdesc<64x1xf32, #shared, #smem, mutable>,
%arg4: !ttg.memdesc<16x16xf32, #shared, #smem, mutable>) {
// We need the splat to allow the AxisAnalysis to work during lowering
%ptr = tt.splat %arg0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked>

- // COMMON: rocdl.global.load.lds {{.*}} {alias_scopes = [[[ASYNC_COPY_SCOPE]]]
+ // COMMON: rocdl.global.load.lds {{.*}} {alias_scopes = [[[$ASYNC_COPY_SCOPE]]]
%0 = ttg.async_copy_global_to_local %ptr, %arg1 : tensor<64x1x!tt.ptr<f32>, #blocked> -> <64x1xf32, #shared, #smem, mutable>
%1 = ttg.async_commit_group %0

%3 = ttg.async_wait %1 {num = 1 : i32}

// Check alias information is not used at all for different lowering paths
- // COMMON-NOT: [[ASYNC_COPY_SCOPE]]
+ // COMMON-NOT: [[$ASYNC_COPY_SCOPE]]

// Test lowering path in common MemoryOpToLLVM pattern
%4 = ttg.local_load %arg1 token %0 : !ttg.memdesc<64x1xf32, #shared, #smem, mutable> -> tensor<64x1xf32, #blocked>
@@ -124,3 +128,40 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
tt.return
}
}
+
+ // -----
+
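+ // Same as above but the token consumed by the LocalLoad inside the loop is carried via scf.for iter_args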
+ // COMMON: [[$LOCAL_LOAD_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.LocalLoads"
+ // COMMON: [[$ASYNC_COPY_SCOPE:#.*]] = #llvm.alias_scope<id = "amdgpu.AsyncCopies"
+ #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
+ #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1]}>
+ #shared1 = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 16, order = [1, 0]}>
+ #smem = #ttg.shared_memory
+ #mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [32, 32], isTransposed = true}>
+ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+ // COMMON-LABEL: @local_loads_with_loop_carried_token
+ tt.func public @local_loads_with_loop_carried_token(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
+ %arg1: !ttg.memdesc<64x1xf16, #shared, #smem, mutable>,
+ %loopIterCount: i32) {
+ %c0_i32 = arith.constant 0 : i32
+ %c1_i32 = arith.constant 1 : i32
+
+ %1 = ttg.async_wait {num = 1 : i32}
+ // COMMON: llvm.load
+ %2 = ttg.local_load %arg1 token %1 : !ttg.memdesc<64x1xf16, #shared, #smem, mutable> -> tensor<64x1xf16, #blocked>
+
+ %loop_result:2 = scf.for %arg14 = %c0_i32 to %loopIterCount step %c1_i32 iter_args(%arg10 = %1, %arg11 = %2) -> (!ttg.async.token, tensor<64x1xf16, #blocked>) : i32 {
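+ // Check that alias information is set although the token arrives through the loop's iter_args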
+ // COMMON: llvm.load {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], {{.*}}, noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
+ %3 = ttg.local_load %arg1 token %arg10 : !ttg.memdesc<64x1xf16, #shared, #smem, mutable> -> tensor<64x1xf16, #blocked>
+ %4 = ttg.async_wait {num = 1 : i32}
+ scf.yield %4, %3 : !ttg.async.token, tensor<64x1xf16, #blocked>
+ }
+
+ // Stores to keep the local_loads
+ %ptr = tt.splat %arg0 : !tt.ptr<f16> -> tensor<64x1x!tt.ptr<f16>, #blocked>
+ tt.store %ptr, %loop_result#1 : tensor<64x1x!tt.ptr<f16>, #blocked>
+
+ // COMMON: llvm.return
+ tt.return
+ }
+ }