@@ -456,10 +456,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(ptr addrspace
456456; OPT: load-store-loop:
457457; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
458458; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
459- ; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
459+ ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
460460; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
461- ; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
462- ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
461+ ; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
462+ ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
463463; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
464464; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
465465; OPT: memcpy-split:
@@ -479,10 +479,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1025(ptr addrspace
479479; OPT: load-store-loop:
480480; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
481481; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
482- ; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
482+ ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
483483; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
484- ; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
485- ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
484+ ; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
485+ ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
486486; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
487487; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
488488; OPT: memcpy-split:
@@ -502,10 +502,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(ptr addrspace
502502; OPT: load-store-loop:
503503; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
504504; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
505- ; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
505+ ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
506506; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
507- ; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
508- ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
507+ ; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
508+ ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
509509; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
510510; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
511511; OPT: memcpy-split:
@@ -525,10 +525,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(ptr addrspace
525525; OPT: load-store-loop:
526526; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
527527; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
528- ; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
528+ ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
529529; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
530- ; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
531- ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
530+ ; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
531+ ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
532532; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
533533; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
534534; OPT: memcpy-split:
@@ -548,10 +548,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(ptr addrspace
548548; OPT: load-store-loop:
549549; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
550550; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
551- ; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
551+ ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
552552; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
553- ; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
554- ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
553+ ; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
554+ ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
555555; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
556556; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
557557; OPT: memcpy-split:
@@ -575,10 +575,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(ptr addrspace
575575; OPT: load-store-loop:
576576; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
577577; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
578- ; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
578+ ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
579579; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
580- ; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
581- ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
580+ ; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
581+ ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
582582; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
583583; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
584584; OPT: memcpy-split:
@@ -606,10 +606,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(ptr addrspace
606606; OPT: load-store-loop:
607607; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
608608; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
609- ; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
609+ ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
610610; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
611- ; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
612- ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
611+ ; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
612+ ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
613613; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
614614; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
615615; OPT: memcpy-split:
@@ -633,10 +633,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(ptr addrspace
633633; OPT: load-store-loop:
634634; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
635635; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
636- ; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
636+ ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
637637; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
638- ; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
639- ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
638+ ; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
639+ ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
640640; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
641641; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
642642; OPT: memcpy-split:
@@ -691,10 +691,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1027(ptr addrspace
691691; OPT: load-store-loop:
692692; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
693693; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
694- ; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
694+ ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
695695; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
696- ; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
697- ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
696+ ; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
697+ ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
698698; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
699699; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
700700; OPT: memcpy-split:
@@ -764,10 +764,10 @@ define amdgpu_kernel void @memcpy_private_align4_private_align4_1027(ptr addrspa
764764; OPT: load-store-loop:
765765; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
766766; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
767- ; OPT-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 4
767+ ; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4
768768; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
769- ; OPT-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
770- ; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 16
769+ ; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
770+ ; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
771771; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
772772; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
773773; OPT: memcpy-split:
@@ -1194,17 +1194,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_16(ptr addrspace(1
11941194; MAX1024-NEXT: ret void
11951195;
11961196; ALL-LABEL: @memcpy_global_align4_global_align4_16(
1197- ; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
1198- ; ALL: load-store-loop:
1199- ; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
1200- ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
1197+ ; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0
12011198; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
1202- ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
1199+ ; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
12031200; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
1204- ; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 16
1205- ; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
1206- ; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
1207- ; ALL: memcpy-split:
12081201; ALL-NEXT: ret void
12091202;
12101203 call void @llvm.memcpy.p1.p1.i64 (ptr addrspace (1 ) align 4 %dst , ptr addrspace (1 ) align 4 %src , i64 16 , i1 false )
0 commit comments