1+ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
12; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-RELAXED %s
23; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-STRICT %s
34
45target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
56
6- ; CHECK-LABEL: @merge_v2i32_v2i32(
7- ; CHECK: load <4 x i32>
8- ; CHECK: store <4 x i32> zeroinitializer
; Vectorization of <2 x i32> accesses through addrspace(1) pointers %a and %b.
; The generated checks expect a single `load <4 x i32>` from %b, split back
; into two <2 x i32> halves with shufflevector, and a single
; `store <4 x i32> zeroinitializer` to %a.
97define amdgpu_kernel void @merge_v2i32_v2i32 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
8+ ; CHECK-LABEL: define amdgpu_kernel void @merge_v2i32_v2i32(
9+ ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0:[0-9]+]] {
10+ ; CHECK-NEXT: [[ENTRY:.*:]]
11+ ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(1) [[B]], align 4
12+ ; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
13+ ; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
14+ ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4
15+ ; CHECK-NEXT: ret void
16+ ;
1017entry:
1118 %a.1 = getelementptr inbounds <2 x i32 >, ptr addrspace (1 ) %a , i64 1
1219 %b.1 = getelementptr inbounds <2 x i32 >, ptr addrspace (1 ) %b , i64 1
@@ -20,10 +27,16 @@ entry:
2027 ret void
2128}
2229
23- ; CHECK-LABEL: @merge_v1i32_v1i32(
24- ; CHECK: load <2 x i32>
25- ; CHECK: store <2 x i32> zeroinitializer
; Same pattern as the <2 x i32> case but with single-element vectors: the
; generated checks expect one `load <2 x i32>` from %b, split into two
; <1 x i32> values with shufflevector, and one
; `store <2 x i32> zeroinitializer` to %a.
2630define amdgpu_kernel void @merge_v1i32_v1i32 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
31+ ; CHECK-LABEL: define amdgpu_kernel void @merge_v1i32_v1i32(
32+ ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
33+ ; CHECK-NEXT: [[ENTRY:.*:]]
34+ ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[B]], align 4
35+ ; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <1 x i32> zeroinitializer
36+ ; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <1 x i32> <i32 1>
37+ ; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4
38+ ; CHECK-NEXT: ret void
39+ ;
2740entry:
2841 %a.1 = getelementptr inbounds <1 x i32 >, ptr addrspace (1 ) %a , i64 1
2942 %b.1 = getelementptr inbounds <1 x i32 >, ptr addrspace (1 ) %b , i64 1
@@ -37,12 +50,18 @@ entry:
3750 ret void
3851}
3952
40- ; CHECK-LABEL: @no_merge_v3i32_v3i32(
41- ; CHECK: load <3 x i32>
42- ; CHECK: load <3 x i32>
43- ; CHECK: store <3 x i32> zeroinitializer
44- ; CHECK: store <3 x i32> zeroinitializer
; Negative test: the generated checks expect the <3 x i32> accesses to stay
; separate. Both GEPs, both `load <3 x i32>` instructions and both
; `store <3 x i32> zeroinitializer` instructions remain in the output.
4553define amdgpu_kernel void @no_merge_v3i32_v3i32 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
54+ ; CHECK-LABEL: define amdgpu_kernel void @no_merge_v3i32_v3i32(
55+ ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
56+ ; CHECK-NEXT: [[ENTRY:.*:]]
57+ ; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds <3 x i32>, ptr addrspace(1) [[A]], i64 1
58+ ; CHECK-NEXT: [[B_1:%.*]] = getelementptr inbounds <3 x i32>, ptr addrspace(1) [[B]], i64 1
59+ ; CHECK-NEXT: [[LD_C:%.*]] = load <3 x i32>, ptr addrspace(1) [[B]], align 4
60+ ; CHECK-NEXT: [[LD_C_IDX_1:%.*]] = load <3 x i32>, ptr addrspace(1) [[B_1]], align 4
61+ ; CHECK-NEXT: store <3 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4
62+ ; CHECK-NEXT: store <3 x i32> zeroinitializer, ptr addrspace(1) [[A_1]], align 4
63+ ; CHECK-NEXT: ret void
64+ ;
4665entry:
4766 %a.1 = getelementptr inbounds <3 x i32 >, ptr addrspace (1 ) %a , i64 1
4867 %b.1 = getelementptr inbounds <3 x i32 >, ptr addrspace (1 ) %b , i64 1
@@ -56,10 +75,16 @@ entry:
5675 ret void
5776}
5877
59- ; CHECK-LABEL: @merge_v2i16_v2i16(
60- ; CHECK: load <4 x i16>
61- ; CHECK: store <4 x i16> zeroinitializer
; 16-bit element variant: the generated checks expect one `load <4 x i16>`
; from %b, split into two <2 x i16> halves with shufflevector, and one
; `store <4 x i16> zeroinitializer` to %a.
6278define amdgpu_kernel void @merge_v2i16_v2i16 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
79+ ; CHECK-LABEL: define amdgpu_kernel void @merge_v2i16_v2i16(
80+ ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
81+ ; CHECK-NEXT: [[ENTRY:.*:]]
82+ ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 4
83+ ; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
84+ ; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
85+ ; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr addrspace(1) [[A]], align 4
86+ ; CHECK-NEXT: ret void
87+ ;
6388entry:
6489 %a.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (1 ) %a , i64 1
6590 %b.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (1 ) %b , i64 1
@@ -73,15 +98,27 @@ entry:
7398 ret void
7499}
75100
76- ; CHECK-OOB-RELAXED-LABEL: @merge_fat_ptrs(
77- ; CHECK-OOB-RELAXED: load <4 x i16>
78- ; CHECK-OOB-RELAXED: store <4 x i16> zeroinitializer
79- ; CHECK-OOB-STRICT-LABEL: @merge_fat_ptrs(
80- ; CHECK-OOB-STRICT: load <2 x i16>
81- ; CHECK-OOB-STRICT: load <2 x i16>
82- ; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
83- ; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
; <2 x i16> accesses through addrspace(7) pointers; the expected output differs
; between the two RUN lines:
;  - CHECK-OOB-RELAXED (run with -mattr=+relaxed-buffer-oob-mode): one
;    `load <4 x i16>` and one `store <4 x i16> zeroinitializer`.
;  - CHECK-OOB-STRICT (run without that feature): the two `load <2 x i16>` and
;    two `store <2 x i16> zeroinitializer` instructions stay separate.
84101define amdgpu_kernel void @merge_fat_ptrs (ptr addrspace (7 ) nocapture %a , ptr addrspace (7 ) nocapture readonly %b ) #0 {
102+ ; CHECK-OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_fat_ptrs(
103+ ; CHECK-OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[A:%.*]], ptr addrspace(7) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
104+ ; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
105+ ; CHECK-OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(7) [[B]], align 4
106+ ; CHECK-OOB-RELAXED-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
107+ ; CHECK-OOB-RELAXED-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
108+ ; CHECK-OOB-RELAXED-NEXT: store <4 x i16> zeroinitializer, ptr addrspace(7) [[A]], align 4
109+ ; CHECK-OOB-RELAXED-NEXT: ret void
110+ ;
111+ ; CHECK-OOB-STRICT-LABEL: define amdgpu_kernel void @merge_fat_ptrs(
112+ ; CHECK-OOB-STRICT-SAME: ptr addrspace(7) captures(none) [[A:%.*]], ptr addrspace(7) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
113+ ; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
114+ ; CHECK-OOB-STRICT-NEXT: [[A_1:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(7) [[A]], i32 1
115+ ; CHECK-OOB-STRICT-NEXT: [[B_1:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(7) [[B]], i32 1
116+ ; CHECK-OOB-STRICT-NEXT: [[LD_C:%.*]] = load <2 x i16>, ptr addrspace(7) [[B]], align 4
117+ ; CHECK-OOB-STRICT-NEXT: [[LD_C_IDX_1:%.*]] = load <2 x i16>, ptr addrspace(7) [[B_1]], align 4
118+ ; CHECK-OOB-STRICT-NEXT: store <2 x i16> zeroinitializer, ptr addrspace(7) [[A]], align 4
119+ ; CHECK-OOB-STRICT-NEXT: store <2 x i16> zeroinitializer, ptr addrspace(7) [[A_1]], align 4
120+ ; CHECK-OOB-STRICT-NEXT: ret void
121+ ;
85122entry:
86123 %a.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (7 ) %a , i32 1
87124 %b.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (7 ) %b , i32 1
@@ -95,11 +132,16 @@ entry:
95132 ret void
96133}
97134
98- ; CHECK-LABEL: @merge_load_i32_v2i16(
99- ; CHECK: load <2 x i32>
100- ; CHECK: extractelement <2 x i32> %0, i32 0
101- ; CHECK: extractelement <2 x i32> %0, i32 1
102135define amdgpu_kernel void @merge_load_i32_v2i16 (ptr addrspace (1 ) nocapture %a ) #0 {
136+ ; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i16(
137+ ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
138+ ; CHECK-NEXT: [[ENTRY:.*:]]
139+ ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[A]], align 4
140+ ; CHECK-NEXT: [[LD_01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
141+ ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
142+ ; CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i32 [[TMP1]] to <2 x i16>
143+ ; CHECK-NEXT: ret void
144+ ;
103145entry:
104146 %a.1 = getelementptr inbounds i32 , ptr addrspace (1 ) %a , i32 1
105147
@@ -112,11 +154,56 @@ entry:
112154attributes #0 = { nounwind }
113155attributes #1 = { nounwind readnone }
114156
115- ; CHECK-LABEL: @merge_i32_2i16_float_4i8(
116- ; CHECK: load <4 x i32>
117- ; CHECK: store <2 x i32>
118- ; CHECK: store <2 x i32>
157+
; Mixed-type merge: the generated checks expect one `load <4 x i32>` from the
; addrspace(1) pointer, with lanes extracted and bitcast to float, <2 x i16>
; and <4 x i8>, followed by two `store <2 x i32>` to the addrspace(2) pointer.
; The RELAXED and STRICT check blocks differ only on the define line: RELAXED
; matches an attribute group (#[[ATTR1]]) and STRICT matches none.
119158define void @merge_i32_2i16_float_4i8 (ptr addrspace (1 ) %ptr1 , ptr addrspace (2 ) %ptr2 ) {
159+ ; CHECK-OOB-RELAXED-LABEL: define void @merge_i32_2i16_float_4i8(
160+ ; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) #[[ATTR1:[0-9]+]] {
161+ ; CHECK-OOB-RELAXED-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 0
162+ ; CHECK-OOB-RELAXED-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[GEP1]], align 4
163+ ; CHECK-OOB-RELAXED-NEXT: [[LOAD12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
164+ ; CHECK-OOB-RELAXED-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
165+ ; CHECK-OOB-RELAXED-NEXT: [[LOAD33:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
166+ ; CHECK-OOB-RELAXED-NEXT: [[TMP3:%.*]] = bitcast i32 [[LOAD33]] to float
167+ ; CHECK-OOB-RELAXED-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
168+ ; CHECK-OOB-RELAXED-NEXT: [[DOTCAST:%.*]] = bitcast i32 [[TMP2]] to <2 x i16>
169+ ; CHECK-OOB-RELAXED-NEXT: [[DOTCAST1:%.*]] = bitcast i32 [[TMP4]] to <4 x i8>
170+ ; CHECK-OOB-RELAXED-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
171+ ; CHECK-OOB-RELAXED-NEXT: [[DOTCAST_CAST:%.*]] = bitcast <2 x i16> [[DOTCAST]] to i32
172+ ; CHECK-OOB-RELAXED-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD12]], i32 0
173+ ; CHECK-OOB-RELAXED-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[DOTCAST_CAST]], i32 1
174+ ; CHECK-OOB-RELAXED-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(2) [[STORE_GEP1]], align 4
175+ ; CHECK-OOB-RELAXED-NEXT: [[STORE_GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(2) [[PTR2]], i64 2
176+ ; CHECK-OOB-RELAXED-NEXT: [[DOTCAST1_CAST:%.*]] = bitcast <4 x i8> [[DOTCAST1]] to i32
177+ ; CHECK-OOB-RELAXED-NEXT: [[TMP7:%.*]] = bitcast float [[TMP3]] to i32
178+ ; CHECK-OOB-RELAXED-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
179+ ; CHECK-OOB-RELAXED-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[DOTCAST1_CAST]], i32 1
180+ ; CHECK-OOB-RELAXED-NEXT: store <2 x i32> [[TMP9]], ptr addrspace(2) [[STORE_GEP3]], align 4
181+ ; CHECK-OOB-RELAXED-NEXT: ret void
182+ ;
183+ ; CHECK-OOB-STRICT-LABEL: define void @merge_i32_2i16_float_4i8(
184+ ; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
185+ ; CHECK-OOB-STRICT-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 0
186+ ; CHECK-OOB-STRICT-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[GEP1]], align 4
187+ ; CHECK-OOB-STRICT-NEXT: [[LOAD12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
188+ ; CHECK-OOB-STRICT-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
189+ ; CHECK-OOB-STRICT-NEXT: [[LOAD33:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
190+ ; CHECK-OOB-STRICT-NEXT: [[TMP3:%.*]] = bitcast i32 [[LOAD33]] to float
191+ ; CHECK-OOB-STRICT-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
192+ ; CHECK-OOB-STRICT-NEXT: [[DOTCAST:%.*]] = bitcast i32 [[TMP2]] to <2 x i16>
193+ ; CHECK-OOB-STRICT-NEXT: [[DOTCAST1:%.*]] = bitcast i32 [[TMP4]] to <4 x i8>
194+ ; CHECK-OOB-STRICT-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
195+ ; CHECK-OOB-STRICT-NEXT: [[DOTCAST_CAST:%.*]] = bitcast <2 x i16> [[DOTCAST]] to i32
196+ ; CHECK-OOB-STRICT-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD12]], i32 0
197+ ; CHECK-OOB-STRICT-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[DOTCAST_CAST]], i32 1
198+ ; CHECK-OOB-STRICT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(2) [[STORE_GEP1]], align 4
199+ ; CHECK-OOB-STRICT-NEXT: [[STORE_GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(2) [[PTR2]], i64 2
200+ ; CHECK-OOB-STRICT-NEXT: [[DOTCAST1_CAST:%.*]] = bitcast <4 x i8> [[DOTCAST1]] to i32
201+ ; CHECK-OOB-STRICT-NEXT: [[TMP7:%.*]] = bitcast float [[TMP3]] to i32
202+ ; CHECK-OOB-STRICT-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
203+ ; CHECK-OOB-STRICT-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[DOTCAST1_CAST]], i32 1
204+ ; CHECK-OOB-STRICT-NEXT: store <2 x i32> [[TMP9]], ptr addrspace(2) [[STORE_GEP3]], align 4
205+ ; CHECK-OOB-STRICT-NEXT: ret void
206+ ;
120207 %gep1 = getelementptr inbounds i32 , ptr addrspace (1 ) %ptr1 , i64 0
121208 %load1 = load i32 , ptr addrspace (1 ) %gep1 , align 4
122209 %gep2 = getelementptr inbounds <2 x i16 >, ptr addrspace (1 ) %ptr1 , i64 1
@@ -136,10 +223,25 @@ define void @merge_i32_2i16_float_4i8(ptr addrspace(1) %ptr1, ptr addrspace(2) %
136223 ret void
137224}
138225
139- ; CHECK-LABEL: @merge_fp_type(
140- ; CHECK: load <2 x float>
141- ; CHECK: bitcast float {{.*}} to <2 x half>
142226define void @merge_fp_type (ptr addrspace (1 ) %ptr1 , ptr addrspace (2 ) %ptr2 ) {
227+ ; CHECK-OOB-RELAXED-LABEL: define void @merge_fp_type(
228+ ; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) #[[ATTR1]] {
229+ ; CHECK-OOB-RELAXED-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 0
230+ ; CHECK-OOB-RELAXED-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[GEP1]], align 4
231+ ; CHECK-OOB-RELAXED-NEXT: [[LOAD11:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
232+ ; CHECK-OOB-RELAXED-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
233+ ; CHECK-OOB-RELAXED-NEXT: [[DOTCAST:%.*]] = bitcast float [[TMP2]] to <2 x half>
234+ ; CHECK-OOB-RELAXED-NEXT: ret void
235+ ;
236+ ; CHECK-OOB-STRICT-LABEL: define void @merge_fp_type(
237+ ; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
238+ ; CHECK-OOB-STRICT-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 0
239+ ; CHECK-OOB-STRICT-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[GEP1]], align 4
240+ ; CHECK-OOB-STRICT-NEXT: [[LOAD11:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
241+ ; CHECK-OOB-STRICT-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
242+ ; CHECK-OOB-STRICT-NEXT: [[DOTCAST:%.*]] = bitcast float [[TMP2]] to <2 x half>
243+ ; CHECK-OOB-STRICT-NEXT: ret void
244+ ;
143245 %gep1 = getelementptr inbounds float , ptr addrspace (1 ) %ptr1 , i64 0
144246 %load1 = load float , ptr addrspace (1 ) %gep1 , align 4
145247 %gep2 = getelementptr inbounds <2 x half >, ptr addrspace (1 ) %ptr1 , i64 1
0 commit comments