@@ -250,6 +250,150 @@ bb2:
250250 store i32 0 , ptr addrspace (5 ) %extractelement
251251 ret void
252252}
253+
; Stores a scalar float into a <3 x float> addrspace(5) alloca through an i8
; GEP whose byte offset is chosen by %idx_sel between 0 and 4.
; The CHECK lines expect the alloca to be replaced by a frozen poison vector,
; the byte offset to be converted to an element index with `lshr ..., 2`, and
; the scalar store to become an insertelement into the loaded vector.
254+ define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4 (ptr %buffer , float %data , i1 %idx_sel ) {
255+ ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4(
256+ ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
257+ ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
258+ ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
259+ ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 4
260+ ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2
261+ ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]]
262+ ; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16
263+ ; CHECK-NEXT: ret void
264+ ;
265+ %alloca = alloca <3 x float >, align 16 , addrspace (5 )
266+ %vec = load <3 x float >, ptr %buffer
267+ store <3 x float > %vec , ptr addrspace (5 ) %alloca
268+ %index = select i1 %idx_sel , i32 0 , i32 4
269+ %elt = getelementptr inbounds nuw i8 , ptr addrspace (5 ) %alloca , i32 %index
270+ store float %data , ptr addrspace (5 ) %elt , align 4
271+ %updated = load <3 x float >, ptr addrspace (5 ) %alloca , align 16
272+ store <3 x float > %updated , ptr %buffer , align 16
273+ ret void
274+ }
275+
; Same shape as the 0-or-4 test, but the i8 GEP byte offset chosen by %idx_sel
; is 4 or 8, so neither candidate is zero.
; The CHECK lines expect the same rewrite: `lshr ..., 2` on the selected byte
; offset followed by an insertelement into the loaded <3 x float>.
276+ define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8 (ptr %buffer , float %data , i1 %idx_sel ) {
277+ ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8(
278+ ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
279+ ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
280+ ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
281+ ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8
282+ ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2
283+ ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]]
284+ ; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16
285+ ; CHECK-NEXT: ret void
286+ ;
287+ %alloca = alloca <3 x float >, align 16 , addrspace (5 )
288+ %vec = load <3 x float >, ptr %buffer
289+ store <3 x float > %vec , ptr addrspace (5 ) %alloca
290+ %index = select i1 %idx_sel , i32 4 , i32 8
291+ %elt = getelementptr inbounds nuw i8 , ptr addrspace (5 ) %alloca , i32 %index
292+ store float %data , ptr addrspace (5 ) %elt , align 4
293+ %updated = load <3 x float >, ptr addrspace (5 ) %alloca , align 16
294+ store <3 x float > %updated , ptr %buffer , align 16
295+ ret void
296+ }
297+
; Nested variant: the alloca is [2 x <3 x float>] and all accesses go through
; %row, a GEP to element 0 of that array. The scalar store uses an i8 GEP off
; %row with a byte offset of 4 or 8 chosen by %idx_sel.
; The CHECK lines expect the alloca to become a frozen poison <8 x float>, the
; <3 x float> store/load to be expanded into per-element extract/insert pairs,
; and the scalar store to become an insertelement at `lshr(index, 2)`.
298+ define amdgpu_kernel void @scalar_alloca_nested_vector_gep_i8_4_or_8 (ptr %buffer , float %data , i1 %idx_sel ) {
299+ ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_nested_vector_gep_i8_4_or_8(
300+ ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
301+ ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <8 x float> poison
302+ ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
303+ ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[VEC]], i64 0
304+ ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[ALLOCA]], float [[TMP1]], i32 0
305+ ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[VEC]], i64 1
306+ ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP2]], float [[TMP3]], i32 1
307+ ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <3 x float> [[VEC]], i64 2
308+ ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP4]], float [[TMP5]], i32 2
309+ ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8
310+ ; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[INDEX]], 2
311+ ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP6]], float [[DATA]], i32 [[TMP7]]
312+ ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP8]], i32 0
313+ ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <3 x float> poison, float [[TMP9]], i64 0
314+ ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x float> [[TMP8]], i32 1
315+ ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <3 x float> [[TMP10]], float [[TMP11]], i64 1
316+ ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x float> [[TMP8]], i32 2
317+ ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <3 x float> [[TMP12]], float [[TMP13]], i64 2
318+ ; CHECK-NEXT: store <3 x float> [[TMP14]], ptr [[BUFFER]], align 16
319+ ; CHECK-NEXT: ret void
320+ ;
321+ %alloca = alloca [2 x <3 x float >], align 16 , addrspace (5 )
322+ %row = getelementptr inbounds [2 x <3 x float >], ptr addrspace (5 ) %alloca , i32 0 , i32 0
323+ %vec = load <3 x float >, ptr %buffer
324+ store <3 x float > %vec , ptr addrspace (5 ) %row , align 16
325+ %index = select i1 %idx_sel , i32 4 , i32 8
326+ %elt = getelementptr inbounds nuw i8 , ptr addrspace (5 ) %row , i32 %index
327+ store float %data , ptr addrspace (5 ) %elt , align 4
328+ %updated = load <3 x float >, ptr addrspace (5 ) %row , align 16
329+ store <3 x float > %updated , ptr %buffer , align 16
330+ ret void
331+ }
332+
; Negative test for the vector rewrite: the i8 GEP byte offset chosen by
; %idx_sel is 4 or 5.
; NOTE(review): presumably the rewrite is rejected because 5 is not a multiple
; of the 4-byte float element size - confirm against the pass.
; The CHECK lines expect the i8 GEP and the scalar float store to survive
; unchanged in form, with no insertelement. The alloca is instead replaced by
; a slot in an addrspace(3) [1024 x <3 x float>] global, indexed by a value
; computed from the workitem ids and two loads off the dispatch pointer.
333+ define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote (ptr %buffer , float %data , i1 %idx_sel ) {
334+ ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote(
335+ ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
336+ ; CHECK-NEXT: [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
337+ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
338+ ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]]
339+ ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
340+ ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]]
341+ ; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
342+ ; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
343+ ; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
344+ ; CHECK-NEXT: [[TMP9:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
345+ ; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
346+ ; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
347+ ; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
348+ ; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
349+ ; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
350+ ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x <3 x float>], ptr addrspace(3) @scalar_alloca_vector_gep_i8_4_or_5_no_promote.alloca, i32 0, i32 [[TMP14]]
351+ ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
352+ ; CHECK-NEXT: store <3 x float> [[VEC]], ptr addrspace(3) [[TMP15]], align 16
353+ ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 5
354+ ; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP15]], i32 [[INDEX]]
355+ ; CHECK-NEXT: store float [[DATA]], ptr addrspace(3) [[ELT]], align 4
356+ ; CHECK-NEXT: [[UPDATED:%.*]] = load <3 x float>, ptr addrspace(3) [[TMP15]], align 16
357+ ; CHECK-NEXT: store <3 x float> [[UPDATED]], ptr [[BUFFER]], align 16
358+ ; CHECK-NEXT: ret void
359+ ;
360+ %alloca = alloca <3 x float >, align 16 , addrspace (5 )
361+ %vec = load <3 x float >, ptr %buffer
362+ store <3 x float > %vec , ptr addrspace (5 ) %alloca
363+ %index = select i1 %idx_sel , i32 4 , i32 5
364+ %elt = getelementptr inbounds nuw i8 , ptr addrspace (5 ) %alloca , i32 %index
365+ store float %data , ptr addrspace (5 ) %elt , align 4
366+ %updated = load <3 x float >, ptr addrspace (5 ) %alloca , align 16
367+ store <3 x float > %updated , ptr %buffer , align 16
368+ ret void
369+ }
370+
; Negative test, nested variant: the alloca is [2 x <3 x float>], accesses go
; through %row (a GEP to element 0), and the i8 GEP byte offset chosen by
; %idx_sel is 4 or 5.
; NOTE(review): presumably the rewrite is rejected because 5 is not a multiple
; of the 4-byte float element size - confirm against the pass.
; The CHECK lines expect the function body to be left as written: the
; addrspace(5) alloca, both GEPs, the scalar store and the vector load remain.
371+ define amdgpu_kernel void @scalar_alloca_nested_vector_gep_i8_4_or_5_no_promote (ptr %buffer , float %data , i1 %idx_sel ) {
372+ ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_nested_vector_gep_i8_4_or_5_no_promote(
373+ ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
374+ ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x <3 x float>], align 16, addrspace(5)
375+ ; CHECK-NEXT: [[ROW:%.*]] = getelementptr inbounds [2 x <3 x float>], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
376+ ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
377+ ; CHECK-NEXT: store <3 x float> [[VEC]], ptr addrspace(5) [[ROW]], align 16
378+ ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 5
379+ ; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) [[ROW]], i32 [[INDEX]]
380+ ; CHECK-NEXT: store float [[DATA]], ptr addrspace(5) [[ELT]], align 4
381+ ; CHECK-NEXT: [[UPDATED:%.*]] = load <3 x float>, ptr addrspace(5) [[ROW]], align 16
382+ ; CHECK-NEXT: store <3 x float> [[UPDATED]], ptr [[BUFFER]], align 16
383+ ; CHECK-NEXT: ret void
384+ ;
385+ %alloca = alloca [2 x <3 x float >], align 16 , addrspace (5 )
386+ %row = getelementptr inbounds [2 x <3 x float >], ptr addrspace (5 ) %alloca , i32 0 , i32 0
387+ %vec = load <3 x float >, ptr %buffer
388+ store <3 x float > %vec , ptr addrspace (5 ) %row , align 16
389+ %index = select i1 %idx_sel , i32 4 , i32 5
390+ %elt = getelementptr inbounds nuw i8 , ptr addrspace (5 ) %row , i32 %index
391+ store float %data , ptr addrspace (5 ) %elt , align 4
392+ %updated = load <3 x float >, ptr addrspace (5 ) %row , align 16
393+ store <3 x float > %updated , ptr %buffer , align 16
394+ ret void
395+ }
396+
253397;.
254398; CHECK: [[META0]] = !{}
255399; CHECK: [[RNG1]] = !{i32 0, i32 1025}
0 commit comments