@@ -250,150 +250,6 @@ bb2:
250250 store i32 0 , ptr addrspace (5 ) %extractelement
251251 ret void
252252}
253-
; Test case: a float store through an i8 getelementptr whose byte offset is
; selected between 0 and 4, targeting an addrspace(5) alloca of <3 x float>.
; The input copies a <3 x float> loaded from %buffer into the alloca,
; overwrites one float at the selected byte offset, reloads the whole vector
; and writes it back to %buffer.
; The expected output contains no alloca and no addrspace(5) memory access:
; the byte offset is shifted right by 2 (4 bytes per float, so 0 -> element 0
; and 4 -> element 1) and the float is inserted directly into the loaded vector.
254- define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4 (ptr %buffer , float %data , i1 %idx_sel ) {
255- ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4(
256- ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
257- ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
258- ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
259- ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 4
260- ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2
261- ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]]
262- ; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16
263- ; CHECK-NEXT: ret void
264- ;
; Input IR: %index is a byte offset (the GEP source element type is i8), not a
; vector element index.
265- %alloca = alloca <3 x float >, align 16 , addrspace (5 )
266- %vec = load <3 x float >, ptr %buffer
267- store <3 x float > %vec , ptr addrspace (5 ) %alloca
268- %index = select i1 %idx_sel , i32 0 , i32 4
269- %elt = getelementptr inbounds nuw i8 , ptr addrspace (5 ) %alloca , i32 %index
270- store float %data , ptr addrspace (5 ) %elt , align 4
271- %updated = load <3 x float >, ptr addrspace (5 ) %alloca , align 16
272- store <3 x float > %updated , ptr %buffer , align 16
273- ret void
274- }
275-
; Test case: same shape as the 0-or-4 variant, but the selected byte offset is
; 4 or 8, so neither choice addresses the first element of the <3 x float>
; alloca.
; The expected output again removes the alloca: the byte offset is shifted
; right by 2 (4 -> element 1, 8 -> element 2) and used as the insertelement
; index into the vector loaded from %buffer.
276- define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8 (ptr %buffer , float %data , i1 %idx_sel ) {
277- ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8(
278- ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
279- ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
280- ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
281- ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8
282- ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2
283- ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]]
284- ; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16
285- ; CHECK-NEXT: ret void
286- ;
; Input IR: both selectable byte offsets are multiples of the 4-byte float size.
287- %alloca = alloca <3 x float >, align 16 , addrspace (5 )
288- %vec = load <3 x float >, ptr %buffer
289- store <3 x float > %vec , ptr addrspace (5 ) %alloca
290- %index = select i1 %idx_sel , i32 4 , i32 8
291- %elt = getelementptr inbounds nuw i8 , ptr addrspace (5 ) %alloca , i32 %index
292- store float %data , ptr addrspace (5 ) %elt , align 4
293- %updated = load <3 x float >, ptr addrspace (5 ) %alloca , align 16
294- store <3 x float > %updated , ptr %buffer , align 16
295- ret void
296- }
297-
; Test case: the i8 getelementptr with a selected byte offset (4 or 8) is
; applied to %row, the first <3 x float> of a [2 x <3 x float>] addrspace(5)
; alloca, rather than to the alloca itself.
; The expected output replaces the alloca with a frozen-poison <8 x float>
; value (presumably 2 rows x 4 float slots, each <3 x float> padded to 16
; bytes; confirm against the pass). The three loaded lanes are copied into
; elements 0..2 one at a time, %data is inserted at (byte offset >> 2), and
; elements 0..2 are extracted again to rebuild the <3 x float> stored to
; %buffer.
298- define amdgpu_kernel void @scalar_alloca_nested_vector_gep_i8_4_or_8 (ptr %buffer , float %data , i1 %idx_sel ) {
299- ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_nested_vector_gep_i8_4_or_8(
300- ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
301- ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <8 x float> poison
302- ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
303- ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[VEC]], i64 0
304- ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[ALLOCA]], float [[TMP1]], i32 0
305- ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[VEC]], i64 1
306- ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP2]], float [[TMP3]], i32 1
307- ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <3 x float> [[VEC]], i64 2
308- ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP4]], float [[TMP5]], i32 2
309- ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8
310- ; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[INDEX]], 2
311- ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP6]], float [[DATA]], i32 [[TMP7]]
312- ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP8]], i32 0
313- ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <3 x float> poison, float [[TMP9]], i64 0
314- ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x float> [[TMP8]], i32 1
315- ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <3 x float> [[TMP10]], float [[TMP11]], i64 1
316- ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x float> [[TMP8]], i32 2
317- ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <3 x float> [[TMP12]], float [[TMP13]], i64 2
318- ; CHECK-NEXT: store <3 x float> [[TMP14]], ptr [[BUFFER]], align 16
319- ; CHECK-NEXT: ret void
320- ;
; Input IR: %row has constant indices (0, 0), so it addresses the start of the
; alloca; all later accesses go through %row.
321- %alloca = alloca [2 x <3 x float >], align 16 , addrspace (5 )
322- %row = getelementptr inbounds [2 x <3 x float >], ptr addrspace (5 ) %alloca , i32 0 , i32 0
323- %vec = load <3 x float >, ptr %buffer
324- store <3 x float > %vec , ptr addrspace (5 ) %row , align 16
325- %index = select i1 %idx_sel , i32 4 , i32 8
326- %elt = getelementptr inbounds nuw i8 , ptr addrspace (5 ) %row , i32 %index
327- store float %data , ptr addrspace (5 ) %elt , align 4
328- %updated = load <3 x float >, ptr addrspace (5 ) %row , align 16
329- store <3 x float > %updated , ptr %buffer , align 16
330- ret void
331- }
332-
; Negative test case for vector promotion: the selected byte offset is 4 or 5,
; and 5 is not a multiple of the 4-byte float size, so a float stored there
; does not line up with a single element of the <3 x float> alloca.
; The expected output contains no insertelement/extractelement rewrite.
; Instead the addrspace(5) alloca is replaced by one slot of an addrspace(3)
; array, [1024 x <3 x float>], named after the function. The slot index is
; computed from values loaded via the dispatch pointer and the x/y/z workitem
; ids, and the original store / i8 GEP / load sequence is kept, retargeted to
; addrspace(3).
333- define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote (ptr %buffer , float %data , i1 %idx_sel ) {
334- ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote(
335- ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
336- ; CHECK-NEXT: [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
337- ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
338- ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]]
339- ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
340- ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]]
341- ; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
342- ; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
343- ; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
344- ; CHECK-NEXT: [[TMP9:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
345- ; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
346- ; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
347- ; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
348- ; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
349- ; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
350- ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x <3 x float>], ptr addrspace(3) @scalar_alloca_vector_gep_i8_4_or_5_no_promote.alloca, i32 0, i32 [[TMP14]]
351- ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
352- ; CHECK-NEXT: store <3 x float> [[VEC]], ptr addrspace(3) [[TMP15]], align 16
353- ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 5
354- ; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP15]], i32 [[INDEX]]
355- ; CHECK-NEXT: store float [[DATA]], ptr addrspace(3) [[ELT]], align 4
356- ; CHECK-NEXT: [[UPDATED:%.*]] = load <3 x float>, ptr addrspace(3) [[TMP15]], align 16
357- ; CHECK-NEXT: store <3 x float> [[UPDATED]], ptr [[BUFFER]], align 16
358- ; CHECK-NEXT: ret void
359- ;
; Input IR: identical to the 4-or-8 variant except for the second select
; operand (5 instead of 8).
360- %alloca = alloca <3 x float >, align 16 , addrspace (5 )
361- %vec = load <3 x float >, ptr %buffer
362- store <3 x float > %vec , ptr addrspace (5 ) %alloca
363- %index = select i1 %idx_sel , i32 4 , i32 5
364- %elt = getelementptr inbounds nuw i8 , ptr addrspace (5 ) %alloca , i32 %index
365- store float %data , ptr addrspace (5 ) %elt , align 4
366- %updated = load <3 x float >, ptr addrspace (5 ) %alloca , align 16
367- store <3 x float > %updated , ptr %buffer , align 16
368- ret void
369- }
370-
; Negative test case for the nested form: a [2 x <3 x float>] addrspace(5)
; alloca accessed through %row with a selected byte offset of 4 or 5 (5 is not
; a multiple of the 4-byte float size).
; The expected output mirrors the input instruction for instruction: the
; alloca, the %row GEP, the i8 GEP and every addrspace(5) load/store remain,
; i.e. no transformation is applied to this function.
; NOTE(review): the reloaded vector is captured as TMP14 rather than UPDATED.
; This is harmless because the capture pattern is %.*, but it suggests these
; assertion lines were adapted by hand.
371- define amdgpu_kernel void @scalar_alloca_nested_vector_gep_i8_4_or_5_no_promote (ptr %buffer , float %data , i1 %idx_sel ) {
372- ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_nested_vector_gep_i8_4_or_5_no_promote(
373- ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
374- ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x <3 x float>], align 16, addrspace(5)
375- ; CHECK-NEXT: [[ROW:%.*]] = getelementptr inbounds [2 x <3 x float>], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
376- ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
377- ; CHECK-NEXT: store <3 x float> [[VEC]], ptr addrspace(5) [[ROW]], align 16
378- ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 5
379- ; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) [[ROW]], i32 [[INDEX]]
380- ; CHECK-NEXT: store float [[DATA]], ptr addrspace(5) [[ELT]], align 4
381- ; CHECK-NEXT: [[TMP14:%.*]] = load <3 x float>, ptr addrspace(5) [[ROW]], align 16
382- ; CHECK-NEXT: store <3 x float> [[TMP14]], ptr [[BUFFER]], align 16
383- ; CHECK-NEXT: ret void
384- ;
; Input IR: identical to the nested 4-or-8 variant except for the second
; select operand (5 instead of 8).
385- %alloca = alloca [2 x <3 x float >], align 16 , addrspace (5 )
386- %row = getelementptr inbounds [2 x <3 x float >], ptr addrspace (5 ) %alloca , i32 0 , i32 0
387- %vec = load <3 x float >, ptr %buffer
388- store <3 x float > %vec , ptr addrspace (5 ) %row , align 16
389- %index = select i1 %idx_sel , i32 4 , i32 5
390- %elt = getelementptr inbounds nuw i8 , ptr addrspace (5 ) %row , i32 %index
391- store float %data , ptr addrspace (5 ) %elt , align 4
392- %updated = load <3 x float >, ptr addrspace (5 ) %row , align 16
393- store <3 x float > %updated , ptr %buffer , align 16
394- ret void
395- }
396-
397253;.
398254; CHECK: [[META0]] = !{}
399255; CHECK: [[RNG1]] = !{i32 0, i32 1025}
0 commit comments