109109 ret void
110110}
111111
; The transposed global load returns per-lane data, so its result is divergent.
; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep)
define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
bb:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep)
  store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
  ret void
}
120+
; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) %gep)
define amdgpu_kernel void @global_load_tr_b128_v8i16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
bb:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) %gep)
  store <8 x i16> %tmp0, ptr addrspace(1) %out, align 16
  ret void
}
129+
; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) %gep)
define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
bb:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) %gep)
  store <8 x half> %tmp0, ptr addrspace(1) %out, align 16
  ret void
}
138+
; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) %gep)
define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
bb:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) %gep)
  store <8 x bfloat> %tmp0, ptr addrspace(1) %out, align 16
  ret void
}
147+
; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) %gep)
define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
bb:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %tmp0 = call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) %gep)
  store i32 %tmp0, ptr addrspace(1) %out, align 4
  ret void
}
156+
; CHECK: DIVERGENT: %tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) %gep)
; NOTE(review): trailing '_' in the kernel name looks accidental — confirm; kept to avoid
; changing an externally visible symbol.
define amdgpu_kernel void @global_load_tr_b128_v4i16_(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
bb:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) %gep)
  store <4 x i16> %tmp0, ptr addrspace(1) %out, align 8
  ret void
}
165+
; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) %gep)
define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
bb:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) %gep)
  store <4 x half> %tmp0, ptr addrspace(1) %out, align 8
  ret void
}
174+
; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) %gep)
define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
bb:
  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
  %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) %gep)
  store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 8
  ret void
}
183+
; Cross-lane intrinsics (results depend on other lanes; convergent, no memory).
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1
; (other intrinsic declarations, including the wmma.bf16 declare, are elided from this view)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i1 immarg) #1
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) #1

; Transposed global-load intrinsics exercised by the kernels above.
declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1))
declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1))
declare <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1))
declare <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1))
declare i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1))
declare <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1))
declare <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1))
declare <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1))

attributes #0 = { nounwind convergent }
attributes #1 = { nounwind readnone convergent }