1+ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
12; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-RELAXED %s
23; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-STRICT %s
34
4- ; CHECK-LABEL: @merge_v2i32_v2i32(
5- ; CHECK: load <4 x i32>
6- ; CHECK: store <4 x i32> zeroinitializer
75define amdgpu_kernel void @merge_v2i32_v2i32 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
6+ ; CHECK-LABEL: define amdgpu_kernel void @merge_v2i32_v2i32(
7+ ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0:[0-9]+]] {
8+ ; CHECK-NEXT: [[ENTRY:.*:]]
9+ ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(1) [[B]], align 4
10+ ; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
11+ ; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
12+ ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4
13+ ; CHECK-NEXT: ret void
14+ ;
815entry:
916 %a.1 = getelementptr inbounds <2 x i32 >, ptr addrspace (1 ) %a , i64 1
1017 %b.1 = getelementptr inbounds <2 x i32 >, ptr addrspace (1 ) %b , i64 1
@@ -18,10 +25,16 @@ entry:
1825 ret void
1926}
2027
21- ; CHECK-LABEL: @merge_v1i32_v1i32(
22- ; CHECK: load <2 x i32>
23- ; CHECK: store <2 x i32> zeroinitializer
2428define amdgpu_kernel void @merge_v1i32_v1i32 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
29+ ; CHECK-LABEL: define amdgpu_kernel void @merge_v1i32_v1i32(
30+ ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
31+ ; CHECK-NEXT: [[ENTRY:.*:]]
32+ ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[B]], align 4
33+ ; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <1 x i32> zeroinitializer
34+ ; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <1 x i32> <i32 1>
35+ ; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4
36+ ; CHECK-NEXT: ret void
37+ ;
2538entry:
2639 %a.1 = getelementptr inbounds <1 x i32 >, ptr addrspace (1 ) %a , i64 1
2740 %b.1 = getelementptr inbounds <1 x i32 >, ptr addrspace (1 ) %b , i64 1
@@ -35,12 +48,18 @@ entry:
3548 ret void
3649}
3750
38- ; CHECK-LABEL: @no_merge_v3i32_v3i32(
39- ; CHECK: load <3 x i32>
40- ; CHECK: load <3 x i32>
41- ; CHECK: store <3 x i32> zeroinitializer
42- ; CHECK: store <3 x i32> zeroinitializer
4351define amdgpu_kernel void @no_merge_v3i32_v3i32 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
52+ ; CHECK-LABEL: define amdgpu_kernel void @no_merge_v3i32_v3i32(
53+ ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
54+ ; CHECK-NEXT: [[ENTRY:.*:]]
55+ ; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds <3 x i32>, ptr addrspace(1) [[A]], i64 1
56+ ; CHECK-NEXT: [[B_1:%.*]] = getelementptr inbounds <3 x i32>, ptr addrspace(1) [[B]], i64 1
57+ ; CHECK-NEXT: [[LD_C:%.*]] = load <3 x i32>, ptr addrspace(1) [[B]], align 4
58+ ; CHECK-NEXT: [[LD_C_IDX_1:%.*]] = load <3 x i32>, ptr addrspace(1) [[B_1]], align 4
59+ ; CHECK-NEXT: store <3 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4
60+ ; CHECK-NEXT: store <3 x i32> zeroinitializer, ptr addrspace(1) [[A_1]], align 4
61+ ; CHECK-NEXT: ret void
62+ ;
4463entry:
4564 %a.1 = getelementptr inbounds <3 x i32 >, ptr addrspace (1 ) %a , i64 1
4665 %b.1 = getelementptr inbounds <3 x i32 >, ptr addrspace (1 ) %b , i64 1
@@ -54,10 +73,16 @@ entry:
5473 ret void
5574}
5675
57- ; CHECK-LABEL: @merge_v2i16_v2i16(
58- ; CHECK: load <4 x i16>
59- ; CHECK: store <4 x i16> zeroinitializer
6076define amdgpu_kernel void @merge_v2i16_v2i16 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
77+ ; CHECK-LABEL: define amdgpu_kernel void @merge_v2i16_v2i16(
78+ ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
79+ ; CHECK-NEXT: [[ENTRY:.*:]]
80+ ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 4
81+ ; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
82+ ; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
83+ ; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr addrspace(1) [[A]], align 4
84+ ; CHECK-NEXT: ret void
85+ ;
6186entry:
6287 %a.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (1 ) %a , i64 1
6388 %b.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (1 ) %b , i64 1
@@ -71,15 +96,27 @@ entry:
7196 ret void
7297}
7398
74- ; CHECK-OOB-RELAXED-LABEL: @merge_fat_ptrs(
75- ; CHECK-OOB-RELAXED: load <4 x i16>
76- ; CHECK-OOB-RELAXED: store <4 x i16> zeroinitializer
77- ; CHECK-OOB-STRICT-LABEL: @merge_fat_ptrs(
78- ; CHECK-OOB-STRICT: load <2 x i16>
79- ; CHECK-OOB-STRICT: load <2 x i16>
80- ; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
81- ; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
8299define amdgpu_kernel void @merge_fat_ptrs (ptr addrspace (7 ) nocapture %a , ptr addrspace (7 ) nocapture readonly %b ) #0 {
100+ ; CHECK-OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_fat_ptrs(
101+ ; CHECK-OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[A:%.*]], ptr addrspace(7) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
102+ ; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
103+ ; CHECK-OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(7) [[B]], align 4
104+ ; CHECK-OOB-RELAXED-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
105+ ; CHECK-OOB-RELAXED-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
106+ ; CHECK-OOB-RELAXED-NEXT: store <4 x i16> zeroinitializer, ptr addrspace(7) [[A]], align 4
107+ ; CHECK-OOB-RELAXED-NEXT: ret void
108+ ;
109+ ; CHECK-OOB-STRICT-LABEL: define amdgpu_kernel void @merge_fat_ptrs(
110+ ; CHECK-OOB-STRICT-SAME: ptr addrspace(7) captures(none) [[A:%.*]], ptr addrspace(7) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
111+ ; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
112+ ; CHECK-OOB-STRICT-NEXT: [[A_1:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(7) [[A]], i32 1
113+ ; CHECK-OOB-STRICT-NEXT: [[B_1:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(7) [[B]], i32 1
114+ ; CHECK-OOB-STRICT-NEXT: [[LD_C:%.*]] = load <2 x i16>, ptr addrspace(7) [[B]], align 4
115+ ; CHECK-OOB-STRICT-NEXT: [[LD_C_IDX_1:%.*]] = load <2 x i16>, ptr addrspace(7) [[B_1]], align 4
116+ ; CHECK-OOB-STRICT-NEXT: store <2 x i16> zeroinitializer, ptr addrspace(7) [[A]], align 4
117+ ; CHECK-OOB-STRICT-NEXT: store <2 x i16> zeroinitializer, ptr addrspace(7) [[A_1]], align 4
118+ ; CHECK-OOB-STRICT-NEXT: ret void
119+ ;
83120entry:
84121 %a.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (7 ) %a , i32 1
85122 %b.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (7 ) %b , i32 1
@@ -94,10 +131,15 @@ entry:
94131}
95132
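96133; Ideally the adjacent i32 load and <2 x i16> load would be merged into a single access, but the vectorizer currently leaves them separate.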
97- ; CHECK-LABEL: @merge_load_i32_v2i16(
98- ; CHECK: load i32,
99- ; CHECK: load <2 x i16>
100134define amdgpu_kernel void @merge_load_i32_v2i16 (ptr addrspace (1 ) nocapture %a ) #0 {
135+ ; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i16(
136+ ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
137+ ; CHECK-NEXT: [[ENTRY:.*:]]
138+ ; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i32 1
139+ ; CHECK-NEXT: [[LD_0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
140+ ; CHECK-NEXT: [[LD_1:%.*]] = load <2 x i16>, ptr addrspace(1) [[A_1]], align 4
141+ ; CHECK-NEXT: ret void
142+ ;
101143entry:
102144 %a.1 = getelementptr inbounds i32 , ptr addrspace (1 ) %a , i32 1
103145
0 commit comments