Skip to content

Commit 5c8c7f3

Browse files
dakersnar and arsenm authored
[LoadStoreVectorizer] Fill gaps in load/store chains to enable vectorization (#159388)
This change introduces Gap Filling, an optimization that aims to fill in holes in otherwise contiguous load/store chains to enable vectorization. It also introduces Chain Extending, which extends the end of a chain to the closest power of 2.

This was originally motivated by the NVPTX target, but I tried to generalize it to be universally applicable to all targets that may use the LSV. I'm more than willing to make adjustments to improve the target-agnostic-ness of this change. I fully expect there are some issues and encourage feedback on how to improve things.

For both loads and stores, we only perform the optimization when we can generate a legal llvm masked load/store intrinsic, masking off the "extra" elements. Determining legality for stores is a little tricky from the NVPTX side, because these intrinsics are only supported for 256-bit vectors. See the other PR I opened for the implementation of the NVPTX lowering of masked store intrinsics, which includes NVPTX TTI changes that return true for isLegalMaskedStore under certain conditions: #159387. This change is dependent on that backend change, but I predict this change will require more discussion, so I am putting them both up at the same time. The backend change will be merged first, assuming both are approved.

Edited: both stores _and loads_ must use masked intrinsics for this optimization to be legal.

---------

Co-authored-by: Matt Arsenault <[email protected]>
1 parent 89bc5ff commit 5c8c7f3

File tree

14 files changed

+2029
-152
lines changed

14 files changed

+2029
-152
lines changed

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Lines changed: 407 additions & 69 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -45,29 +45,32 @@ define half @fh(ptr %p) {
4545
; ENABLED-LABEL: fh(
4646
; ENABLED: {
4747
; ENABLED-NEXT: .reg .b16 %rs<10>;
48-
; ENABLED-NEXT: .reg .b32 %r<13>;
48+
; ENABLED-NEXT: .reg .b32 %r<17>;
4949
; ENABLED-NEXT: .reg .b64 %rd<2>;
5050
; ENABLED-EMPTY:
5151
; ENABLED-NEXT: // %bb.0:
5252
; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0];
53-
; ENABLED-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
54-
; ENABLED-NEXT: ld.b16 %rs5, [%rd1+8];
55-
; ENABLED-NEXT: cvt.f32.f16 %r1, %rs2;
56-
; ENABLED-NEXT: cvt.f32.f16 %r2, %rs1;
57-
; ENABLED-NEXT: add.rn.f32 %r3, %r2, %r1;
58-
; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r3;
59-
; ENABLED-NEXT: cvt.f32.f16 %r4, %rs4;
60-
; ENABLED-NEXT: cvt.f32.f16 %r5, %rs3;
61-
; ENABLED-NEXT: add.rn.f32 %r6, %r5, %r4;
62-
; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r6;
63-
; ENABLED-NEXT: cvt.f32.f16 %r7, %rs7;
64-
; ENABLED-NEXT: cvt.f32.f16 %r8, %rs6;
65-
; ENABLED-NEXT: add.rn.f32 %r9, %r8, %r7;
66-
; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r9;
67-
; ENABLED-NEXT: cvt.f32.f16 %r10, %rs8;
68-
; ENABLED-NEXT: cvt.f32.f16 %r11, %rs5;
69-
; ENABLED-NEXT: add.rn.f32 %r12, %r10, %r11;
70-
; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r12;
53+
; ENABLED-NEXT: .pragma "used_bytes_mask 0x3ff";
54+
; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
55+
; ENABLED-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
56+
; ENABLED-NEXT: mov.b32 {%rs2, %rs3}, %r2;
57+
; ENABLED-NEXT: mov.b32 {%rs4, %rs5}, %r1;
58+
; ENABLED-NEXT: cvt.f32.f16 %r5, %rs5;
59+
; ENABLED-NEXT: cvt.f32.f16 %r6, %rs4;
60+
; ENABLED-NEXT: add.rn.f32 %r7, %r6, %r5;
61+
; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r7;
62+
; ENABLED-NEXT: cvt.f32.f16 %r8, %rs3;
63+
; ENABLED-NEXT: cvt.f32.f16 %r9, %rs2;
64+
; ENABLED-NEXT: add.rn.f32 %r10, %r9, %r8;
65+
; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r10;
66+
; ENABLED-NEXT: cvt.f32.f16 %r11, %rs7;
67+
; ENABLED-NEXT: cvt.f32.f16 %r12, %rs6;
68+
; ENABLED-NEXT: add.rn.f32 %r13, %r12, %r11;
69+
; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r13;
70+
; ENABLED-NEXT: cvt.f32.f16 %r14, %rs8;
71+
; ENABLED-NEXT: cvt.f32.f16 %r15, %rs1;
72+
; ENABLED-NEXT: add.rn.f32 %r16, %r14, %r15;
73+
; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r16;
7174
; ENABLED-NEXT: st.param.b16 [func_retval0], %rs9;
7275
; ENABLED-NEXT: ret;
7376
;
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s
3+
; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}
4+
5+
; This is testing the lowering behavior of this case from LoadStoreVectorizer/NVPTX/4x2xhalf.ll
6+
; where two 3xhalfs are chained together and extended to 8xhalf.
7+
define void @halfx3_extend_chain(ptr align 16 captures(none) %rd0) {
8+
; CHECK-LABEL: halfx3_extend_chain(
9+
; CHECK: {
10+
; CHECK-NEXT: .reg .b16 %rs<7>;
11+
; CHECK-NEXT: .reg .b32 %r<12>;
12+
; CHECK-NEXT: .reg .b64 %rd<2>;
13+
; CHECK-EMPTY:
14+
; CHECK-NEXT: // %bb.0:
15+
; CHECK-NEXT: ld.param.b64 %rd1, [halfx3_extend_chain_param_0];
16+
; CHECK-NEXT: .pragma "used_bytes_mask 0xfff";
17+
; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
18+
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
19+
; CHECK-NEXT: mov.b32 {_, %rs3}, %r2;
20+
; CHECK-NEXT: mov.b32 %r5, {%rs3, %rs1};
21+
; CHECK-NEXT: mov.b32 %r6, {%rs2, %rs4};
22+
; CHECK-NEXT: mov.b32 %r7, 0;
23+
; CHECK-NEXT: max.f16x2 %r8, %r2, %r7;
24+
; CHECK-NEXT: max.f16x2 %r9, %r1, %r7;
25+
; CHECK-NEXT: st.b32 [%rd1], %r9;
26+
; CHECK-NEXT: mov.b32 {%rs5, _}, %r8;
27+
; CHECK-NEXT: st.b16 [%rd1+4], %rs5;
28+
; CHECK-NEXT: max.f16x2 %r10, %r6, %r7;
29+
; CHECK-NEXT: max.f16x2 %r11, %r5, %r7;
30+
; CHECK-NEXT: st.b32 [%rd1+6], %r11;
31+
; CHECK-NEXT: mov.b32 {%rs6, _}, %r10;
32+
; CHECK-NEXT: st.b16 [%rd1+10], %rs6;
33+
; CHECK-NEXT: ret;
34+
%load1 = load <3 x half>, ptr %rd0, align 16
35+
%p1 = fcmp ogt <3 x half> %load1, zeroinitializer
36+
%s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer
37+
store <3 x half> %s1, ptr %rd0, align 16
38+
%in2 = getelementptr half, ptr %rd0, i64 3
39+
%load2 = load <3 x half>, ptr %in2, align 4
40+
%p2 = fcmp ogt <3 x half> %load2, zeroinitializer
41+
%s2 = select <3 x i1> %p2, <3 x half> %load2, <3 x half> zeroinitializer
42+
store <3 x half> %s2, ptr %in2, align 4
43+
ret void
44+
}
45+
46+
; This disables the vectorization by reducing the alignment.
47+
define void @halfx3_no_align(ptr align 4 captures(none) %rd0) {
48+
; CHECK-LABEL: halfx3_no_align(
49+
; CHECK: {
50+
; CHECK-NEXT: .reg .b16 %rs<7>;
51+
; CHECK-NEXT: .reg .b32 %r<10>;
52+
; CHECK-NEXT: .reg .b64 %rd<2>;
53+
; CHECK-EMPTY:
54+
; CHECK-NEXT: // %bb.0:
55+
; CHECK-NEXT: ld.param.b64 %rd1, [halfx3_no_align_param_0];
56+
; CHECK-NEXT: ld.b16 %rs1, [%rd1+4];
57+
; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
58+
; CHECK-NEXT: ld.b32 %r2, [%rd1];
59+
; CHECK-NEXT: mov.b32 %r3, 0;
60+
; CHECK-NEXT: max.f16x2 %r4, %r1, %r3;
61+
; CHECK-NEXT: max.f16x2 %r5, %r2, %r3;
62+
; CHECK-NEXT: st.b32 [%rd1], %r5;
63+
; CHECK-NEXT: mov.b32 {%rs3, _}, %r4;
64+
; CHECK-NEXT: st.b16 [%rd1+4], %rs3;
65+
; CHECK-NEXT: ld.b16 %rs4, [%rd1+10];
66+
; CHECK-NEXT: mov.b32 %r6, {%rs4, %rs5};
67+
; CHECK-NEXT: ld.b32 %r7, [%rd1+6];
68+
; CHECK-NEXT: max.f16x2 %r8, %r6, %r3;
69+
; CHECK-NEXT: max.f16x2 %r9, %r7, %r3;
70+
; CHECK-NEXT: st.b32 [%rd1+6], %r9;
71+
; CHECK-NEXT: mov.b32 {%rs6, _}, %r8;
72+
; CHECK-NEXT: st.b16 [%rd1+10], %rs6;
73+
; CHECK-NEXT: ret;
74+
%load1 = load <3 x half>, ptr %rd0, align 4
75+
%p1 = fcmp ogt <3 x half> %load1, zeroinitializer
76+
%s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer
77+
store <3 x half> %s1, ptr %rd0, align 4
78+
%in2 = getelementptr half, ptr %rd0, i64 3
79+
%load2 = load <3 x half>, ptr %in2, align 4
80+
%p2 = fcmp ogt <3 x half> %load2, zeroinitializer
81+
%s2 = select <3 x i1> %p2, <3 x half> %load2, <3 x half> zeroinitializer
82+
store <3 x half> %s2, ptr %in2, align 4
83+
ret void
84+
}

llvm/test/CodeGen/NVPTX/param-vectorize-device.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -171,8 +171,8 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by
171171
; CHECK: .func (.param .align 16 .b8 func_retval0[12])
172172
; CHECK-LABEL: callee_St4x3(
173173
; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12]
174-
; CHECK: ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0];
175-
; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8];
174+
; CHECK: .pragma "used_bytes_mask 0xfff";
175+
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], %{{.*}}}, [callee_St4x3_param_0];
176176
; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]};
177177
; CHECK-DAG: st.param.b32 [func_retval0+8], [[R3]];
178178
; CHECK-NEXT: ret;
@@ -394,8 +394,8 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by
394394
; CHECK-LABEL: callee_St4x7(
395395
; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28]
396396
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0];
397-
; CHECK: ld.param.v2.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16];
398-
; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24];
397+
; CHECK: .pragma "used_bytes_mask 0xfff";
398+
; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], %{{.*}}}, [callee_St4x7_param_0+16];
399399
; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
400400
; CHECK-DAG: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
401401
; CHECK-DAG: st.param.b32 [func_retval0+24], [[R7]];

llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll

Lines changed: 88 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,41 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
12
; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s
23

34
define void @ldg_f16(ptr nocapture align 16 %rd0) {
5+
; CHECK-LABEL: define void @ldg_f16(
6+
; CHECK-SAME: ptr align 16 captures(none) [[RD0:%.*]]) {
7+
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[RD0]], align 16
8+
; CHECK-NEXT: [[LOAD11:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> <i32 0, i32 1>
9+
; CHECK-NEXT: [[LOAD22:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> <i32 2, i32 3>
10+
; CHECK-NEXT: [[LOAD33:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> <i32 4, i32 5>
11+
; CHECK-NEXT: [[LOAD44:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> <i32 6, i32 7>
12+
; CHECK-NEXT: [[P1:%.*]] = fcmp ogt <2 x half> [[LOAD11]], zeroinitializer
13+
; CHECK-NEXT: [[S1:%.*]] = select <2 x i1> [[P1]], <2 x half> [[LOAD11]], <2 x half> zeroinitializer
14+
; CHECK-NEXT: [[P2:%.*]] = fcmp ogt <2 x half> [[LOAD22]], zeroinitializer
15+
; CHECK-NEXT: [[S2:%.*]] = select <2 x i1> [[P2]], <2 x half> [[LOAD22]], <2 x half> zeroinitializer
16+
; CHECK-NEXT: [[P3:%.*]] = fcmp ogt <2 x half> [[LOAD33]], zeroinitializer
17+
; CHECK-NEXT: [[S3:%.*]] = select <2 x i1> [[P3]], <2 x half> [[LOAD33]], <2 x half> zeroinitializer
18+
; CHECK-NEXT: [[P4:%.*]] = fcmp ogt <2 x half> [[LOAD44]], zeroinitializer
19+
; CHECK-NEXT: [[S4:%.*]] = select <2 x i1> [[P4]], <2 x half> [[LOAD44]], <2 x half> zeroinitializer
20+
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x half> [[S1]], i32 0
21+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> poison, half [[TMP2]], i32 0
22+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[S1]], i32 1
23+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x half> [[TMP3]], half [[TMP4]], i32 1
24+
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x half> [[S2]], i32 0
25+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x half> [[TMP5]], half [[TMP6]], i32 2
26+
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x half> [[S2]], i32 1
27+
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x half> [[TMP7]], half [[TMP8]], i32 3
28+
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x half> [[S3]], i32 0
29+
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x half> [[TMP9]], half [[TMP10]], i32 4
30+
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x half> [[S3]], i32 1
31+
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x half> [[TMP11]], half [[TMP12]], i32 5
32+
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x half> [[S4]], i32 0
33+
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x half> [[TMP13]], half [[TMP14]], i32 6
34+
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x half> [[S4]], i32 1
35+
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x half> [[TMP15]], half [[TMP16]], i32 7
36+
; CHECK-NEXT: store <8 x half> [[TMP17]], ptr [[RD0]], align 16
37+
; CHECK-NEXT: ret void
38+
;
439
%load1 = load <2 x half>, ptr %rd0, align 16
540
%p1 = fcmp ogt <2 x half> %load1, zeroinitializer
641
%s1 = select <2 x i1> %p1, <2 x half> %load1, <2 x half> zeroinitializer
@@ -22,20 +57,39 @@ define void @ldg_f16(ptr nocapture align 16 %rd0) {
2257
store <2 x half> %s4, ptr %in4, align 4
2358
ret void
2459

25-
; CHECK-LABEL: @ldg_f16
26-
; CHECK: %[[LD:.*]] = load <8 x half>, ptr
27-
; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 0, i32 1>
28-
; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 2, i32 3>
29-
; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 4, i32 5>
30-
; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 6, i32 7>
31-
; CHECK: store <8 x half>
3260
}
3361

3462
define void @no_nonpow2_vector(ptr nocapture align 16 %rd0) {
35-
%load1 = load <3 x half>, ptr %rd0, align 4
63+
; CHECK-LABEL: define void @no_nonpow2_vector(
64+
; CHECK-SAME: ptr align 16 captures(none) [[RD0:%.*]]) {
65+
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0(ptr align 16 [[RD0]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x half> poison)
66+
; CHECK-NEXT: [[LOAD13:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <3 x i32> <i32 0, i32 1, i32 2>
67+
; CHECK-NEXT: [[LOAD24:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <3 x i32> <i32 3, i32 4, i32 5>
68+
; CHECK-NEXT: [[EXTEND5:%.*]] = extractelement <8 x half> [[TMP1]], i32 6
69+
; CHECK-NEXT: [[EXTEND26:%.*]] = extractelement <8 x half> [[TMP1]], i32 7
70+
; CHECK-NEXT: [[P1:%.*]] = fcmp ogt <3 x half> [[LOAD13]], zeroinitializer
71+
; CHECK-NEXT: [[S1:%.*]] = select <3 x i1> [[P1]], <3 x half> [[LOAD13]], <3 x half> zeroinitializer
72+
; CHECK-NEXT: store <3 x half> [[S1]], ptr [[RD0]], align 16
73+
; CHECK-NEXT: [[IN2:%.*]] = getelementptr half, ptr [[RD0]], i64 3
74+
; CHECK-NEXT: [[P2:%.*]] = fcmp ogt <3 x half> [[LOAD24]], zeroinitializer
75+
; CHECK-NEXT: [[S2:%.*]] = select <3 x i1> [[P2]], <3 x half> [[LOAD24]], <3 x half> zeroinitializer
76+
; CHECK-NEXT: store <3 x half> [[S2]], ptr [[IN2]], align 4
77+
; CHECK-NEXT: [[IN3:%.*]] = getelementptr half, ptr [[RD0]], i64 6
78+
; CHECK-NEXT: [[LOAD3:%.*]] = load <3 x half>, ptr [[IN3]], align 4
79+
; CHECK-NEXT: [[P3:%.*]] = fcmp ogt <3 x half> [[LOAD3]], zeroinitializer
80+
; CHECK-NEXT: [[S3:%.*]] = select <3 x i1> [[P3]], <3 x half> [[LOAD3]], <3 x half> zeroinitializer
81+
; CHECK-NEXT: store <3 x half> [[S3]], ptr [[IN3]], align 4
82+
; CHECK-NEXT: [[IN4:%.*]] = getelementptr half, ptr [[RD0]], i64 9
83+
; CHECK-NEXT: [[LOAD4:%.*]] = load <3 x half>, ptr [[IN4]], align 4
84+
; CHECK-NEXT: [[P4:%.*]] = fcmp ogt <3 x half> [[LOAD4]], zeroinitializer
85+
; CHECK-NEXT: [[S4:%.*]] = select <3 x i1> [[P4]], <3 x half> [[LOAD4]], <3 x half> zeroinitializer
86+
; CHECK-NEXT: store <3 x half> [[S4]], ptr [[IN4]], align 4
87+
; CHECK-NEXT: ret void
88+
;
89+
%load1 = load <3 x half>, ptr %rd0, align 16
3690
%p1 = fcmp ogt <3 x half> %load1, zeroinitializer
3791
%s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer
38-
store <3 x half> %s1, ptr %rd0, align 4
92+
store <3 x half> %s1, ptr %rd0, align 16
3993
%in2 = getelementptr half, ptr %rd0, i64 3
4094
%load2 = load <3 x half>, ptr %in2, align 4
4195
%p2 = fcmp ogt <3 x half> %load2, zeroinitializer
@@ -52,16 +106,36 @@ define void @no_nonpow2_vector(ptr nocapture align 16 %rd0) {
52106
%s4 = select <3 x i1> %p4, <3 x half> %load4, <3 x half> zeroinitializer
53107
store <3 x half> %s4, ptr %in4, align 4
54108
ret void
55-
56-
; CHECK-LABEL: @no_nonpow2_vector
57-
; CHECK-NOT: shufflevector
58109
}
59110

60111
define void @no_pointer_vector(ptr nocapture align 16 %rd0) {
61-
%load1 = load <2 x ptr>, ptr %rd0, align 4
112+
; CHECK-LABEL: define void @no_pointer_vector(
113+
; CHECK-SAME: ptr align 16 captures(none) [[RD0:%.*]]) {
114+
; CHECK-NEXT: [[LOAD1:%.*]] = load <2 x ptr>, ptr [[RD0]], align 16
115+
; CHECK-NEXT: [[P1:%.*]] = icmp ne <2 x ptr> [[LOAD1]], zeroinitializer
116+
; CHECK-NEXT: [[S1:%.*]] = select <2 x i1> [[P1]], <2 x ptr> [[LOAD1]], <2 x ptr> zeroinitializer
117+
; CHECK-NEXT: store <2 x ptr> [[S1]], ptr [[RD0]], align 16
118+
; CHECK-NEXT: [[IN2:%.*]] = getelementptr ptr, ptr [[RD0]], i64 2
119+
; CHECK-NEXT: [[LOAD2:%.*]] = load <2 x ptr>, ptr [[IN2]], align 4
120+
; CHECK-NEXT: [[P2:%.*]] = icmp ne <2 x ptr> [[LOAD2]], zeroinitializer
121+
; CHECK-NEXT: [[S2:%.*]] = select <2 x i1> [[P2]], <2 x ptr> [[LOAD2]], <2 x ptr> zeroinitializer
122+
; CHECK-NEXT: store <2 x ptr> [[S2]], ptr [[IN2]], align 4
123+
; CHECK-NEXT: [[IN3:%.*]] = getelementptr ptr, ptr [[RD0]], i64 4
124+
; CHECK-NEXT: [[LOAD3:%.*]] = load <2 x ptr>, ptr [[IN3]], align 4
125+
; CHECK-NEXT: [[P3:%.*]] = icmp ne <2 x ptr> [[LOAD3]], zeroinitializer
126+
; CHECK-NEXT: [[S3:%.*]] = select <2 x i1> [[P3]], <2 x ptr> [[LOAD3]], <2 x ptr> zeroinitializer
127+
; CHECK-NEXT: store <2 x ptr> [[S3]], ptr [[IN3]], align 4
128+
; CHECK-NEXT: [[IN4:%.*]] = getelementptr ptr, ptr [[RD0]], i64 6
129+
; CHECK-NEXT: [[LOAD4:%.*]] = load <2 x ptr>, ptr [[IN4]], align 4
130+
; CHECK-NEXT: [[P4:%.*]] = icmp ne <2 x ptr> [[LOAD4]], zeroinitializer
131+
; CHECK-NEXT: [[S4:%.*]] = select <2 x i1> [[P4]], <2 x ptr> [[LOAD4]], <2 x ptr> zeroinitializer
132+
; CHECK-NEXT: store <2 x ptr> [[S4]], ptr [[IN4]], align 4
133+
; CHECK-NEXT: ret void
134+
;
135+
%load1 = load <2 x ptr>, ptr %rd0, align 16
62136
%p1 = icmp ne <2 x ptr> %load1, zeroinitializer
63137
%s1 = select <2 x i1> %p1, <2 x ptr> %load1, <2 x ptr> zeroinitializer
64-
store <2 x ptr> %s1, ptr %rd0, align 4
138+
store <2 x ptr> %s1, ptr %rd0, align 16
65139
%in2 = getelementptr ptr, ptr %rd0, i64 2
66140
%load2 = load <2 x ptr>, ptr %in2, align 4
67141
%p2 = icmp ne <2 x ptr> %load2, zeroinitializer
@@ -78,7 +152,4 @@ define void @no_pointer_vector(ptr nocapture align 16 %rd0) {
78152
%s4 = select <2 x i1> %p4, <2 x ptr> %load4, <2 x ptr> zeroinitializer
79153
store <2 x ptr> %s4, ptr %in4, align 4
80154
ret void
81-
82-
; CHECK-LABEL: @no_pointer_vector
83-
; CHECK-NOT: shufflevector
84155
}

0 commit comments

Comments
 (0)