1- ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
21; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-RELAXED %s
32; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-STRICT %s
43
4+ ; CHECK-LABEL: @merge_v2i32_v2i32(
5+ ; CHECK: load <4 x i32>
6+ ; CHECK: store <4 x i32> zeroinitializer
57define amdgpu_kernel void @merge_v2i32_v2i32 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
6- ; CHECK-LABEL: define amdgpu_kernel void @merge_v2i32_v2i32(
7- ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0:[0-9]+]] {
8- ; CHECK-NEXT: [[ENTRY:.*:]]
9- ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(1) [[B]], align 4
10- ; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
11- ; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
12- ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4
13- ; CHECK-NEXT: ret void
14- ;
158entry:
169 %a.1 = getelementptr inbounds <2 x i32 >, ptr addrspace (1 ) %a , i64 1
1710 %b.1 = getelementptr inbounds <2 x i32 >, ptr addrspace (1 ) %b , i64 1
@@ -25,16 +18,10 @@ entry:
2518 ret void
2619}
2720
21+ ; CHECK-LABEL: @merge_v1i32_v1i32(
22+ ; CHECK: load <2 x i32>
23+ ; CHECK: store <2 x i32> zeroinitializer
2824define amdgpu_kernel void @merge_v1i32_v1i32 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
29- ; CHECK-LABEL: define amdgpu_kernel void @merge_v1i32_v1i32(
30- ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
31- ; CHECK-NEXT: [[ENTRY:.*:]]
32- ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[B]], align 4
33- ; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <1 x i32> zeroinitializer
34- ; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <1 x i32> <i32 1>
35- ; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4
36- ; CHECK-NEXT: ret void
37- ;
3825entry:
3926 %a.1 = getelementptr inbounds <1 x i32 >, ptr addrspace (1 ) %a , i64 1
4027 %b.1 = getelementptr inbounds <1 x i32 >, ptr addrspace (1 ) %b , i64 1
@@ -48,18 +35,12 @@ entry:
4835 ret void
4936}
5037
38+ ; CHECK-LABEL: @no_merge_v3i32_v3i32(
39+ ; CHECK: load <3 x i32>
40+ ; CHECK: load <3 x i32>
41+ ; CHECK: store <3 x i32> zeroinitializer
42+ ; CHECK: store <3 x i32> zeroinitializer
5143define amdgpu_kernel void @no_merge_v3i32_v3i32 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
52- ; CHECK-LABEL: define amdgpu_kernel void @no_merge_v3i32_v3i32(
53- ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
54- ; CHECK-NEXT: [[ENTRY:.*:]]
55- ; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds <3 x i32>, ptr addrspace(1) [[A]], i64 1
56- ; CHECK-NEXT: [[B_1:%.*]] = getelementptr inbounds <3 x i32>, ptr addrspace(1) [[B]], i64 1
57- ; CHECK-NEXT: [[LD_C:%.*]] = load <3 x i32>, ptr addrspace(1) [[B]], align 4
58- ; CHECK-NEXT: [[LD_C_IDX_1:%.*]] = load <3 x i32>, ptr addrspace(1) [[B_1]], align 4
59- ; CHECK-NEXT: store <3 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4
60- ; CHECK-NEXT: store <3 x i32> zeroinitializer, ptr addrspace(1) [[A_1]], align 4
61- ; CHECK-NEXT: ret void
62- ;
6344entry:
6445 %a.1 = getelementptr inbounds <3 x i32 >, ptr addrspace (1 ) %a , i64 1
6546 %b.1 = getelementptr inbounds <3 x i32 >, ptr addrspace (1 ) %b , i64 1
@@ -73,16 +54,10 @@ entry:
7354 ret void
7455}
7556
57+ ; CHECK-LABEL: @merge_v2i16_v2i16(
58+ ; CHECK: load <4 x i16>
59+ ; CHECK: store <4 x i16> zeroinitializer
7660define amdgpu_kernel void @merge_v2i16_v2i16 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
77- ; CHECK-LABEL: define amdgpu_kernel void @merge_v2i16_v2i16(
78- ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
79- ; CHECK-NEXT: [[ENTRY:.*:]]
80- ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 4
81- ; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
82- ; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
83- ; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr addrspace(1) [[A]], align 4
84- ; CHECK-NEXT: ret void
85- ;
8661entry:
8762 %a.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (1 ) %a , i64 1
8863 %b.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (1 ) %b , i64 1
@@ -96,27 +71,15 @@ entry:
9671 ret void
9772}
9873
74+ ; CHECK-OOB-RELAXED-LABEL: @merge_fat_ptrs(
75+ ; CHECK-OOB-RELAXED: load <4 x i16>
76+ ; CHECK-OOB-RELAXED: store <4 x i16> zeroinitializer
77+ ; CHECK-OOB-STRICT-LABEL: @merge_fat_ptrs(
78+ ; CHECK-OOB-STRICT: load <2 x i16>
79+ ; CHECK-OOB-STRICT: load <2 x i16>
80+ ; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
81+ ; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
9982define amdgpu_kernel void @merge_fat_ptrs (ptr addrspace (7 ) nocapture %a , ptr addrspace (7 ) nocapture readonly %b ) #0 {
100- ; CHECK-OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_fat_ptrs(
101- ; CHECK-OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[A:%.*]], ptr addrspace(7) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
102- ; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
103- ; CHECK-OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(7) [[B]], align 4
104- ; CHECK-OOB-RELAXED-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
105- ; CHECK-OOB-RELAXED-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
106- ; CHECK-OOB-RELAXED-NEXT: store <4 x i16> zeroinitializer, ptr addrspace(7) [[A]], align 4
107- ; CHECK-OOB-RELAXED-NEXT: ret void
108- ;
109- ; CHECK-OOB-STRICT-LABEL: define amdgpu_kernel void @merge_fat_ptrs(
110- ; CHECK-OOB-STRICT-SAME: ptr addrspace(7) captures(none) [[A:%.*]], ptr addrspace(7) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
111- ; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
112- ; CHECK-OOB-STRICT-NEXT: [[A_1:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(7) [[A]], i32 1
113- ; CHECK-OOB-STRICT-NEXT: [[B_1:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(7) [[B]], i32 1
114- ; CHECK-OOB-STRICT-NEXT: [[LD_C:%.*]] = load <2 x i16>, ptr addrspace(7) [[B]], align 4
115- ; CHECK-OOB-STRICT-NEXT: [[LD_C_IDX_1:%.*]] = load <2 x i16>, ptr addrspace(7) [[B_1]], align 4
116- ; CHECK-OOB-STRICT-NEXT: store <2 x i16> zeroinitializer, ptr addrspace(7) [[A]], align 4
117- ; CHECK-OOB-STRICT-NEXT: store <2 x i16> zeroinitializer, ptr addrspace(7) [[A_1]], align 4
118- ; CHECK-OOB-STRICT-NEXT: ret void
119- ;
12083entry:
12184 %a.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (7 ) %a , i32 1
12285 %b.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (7 ) %b , i32 1
@@ -131,15 +94,10 @@ entry:
13194}
13295
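13396; Ideally the i32 load and the adjacent <2 x i16> load would be merged into a single access; currently they are left as two separate loads.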
97+ ; CHECK-LABEL: @merge_load_i32_v2i16(
98+ ; CHECK: load i32,
99+ ; CHECK: load <2 x i16>
134100define amdgpu_kernel void @merge_load_i32_v2i16 (ptr addrspace (1 ) nocapture %a ) #0 {
135- ; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i16(
136- ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
137- ; CHECK-NEXT: [[ENTRY:.*:]]
138- ; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i32 1
139- ; CHECK-NEXT: [[LD_0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
140- ; CHECK-NEXT: [[LD_1:%.*]] = load <2 x i16>, ptr addrspace(1) [[A_1]], align 4
141- ; CHECK-NEXT: ret void
142- ;
143101entry:
144102 %a.1 = getelementptr inbounds i32 , ptr addrspace (1 ) %a , i32 1
145103
0 commit comments