Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
81b0534
[LoadStoreVectorizer] Fill gaps in loads/stores to enable vectorization
dakersnar Sep 17, 2025
b147e23
Clang format
dakersnar Sep 17, 2025
c6d98ba
Remove cl opts
dakersnar Sep 18, 2025
adeacac
Add context argument to TTI API
dakersnar Sep 18, 2025
47913a3
Update llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
dakersnar Sep 25, 2025
b6b87e7
Update tests to test for masked load generation in the LSV
dakersnar Oct 16, 2025
8854d5a
Remove isLegalToWidenLoads API
dakersnar Oct 16, 2025
a1d2827
Change LSV to create masked loads
dakersnar Oct 16, 2025
bb25df1
Update calls to TTI to match changes in lowering PR
dakersnar Oct 22, 2025
030c0bb
Update tests to match the new masked load/store syntax, moving alignm…
dakersnar Oct 22, 2025
0a0aa2e
Simplify pre-gap-filling TTI legality check
dakersnar Oct 24, 2025
20bf7ad
Clean up comments and simplify some logic
dakersnar Oct 24, 2025
8d0d2e9
More comment improvement
dakersnar Oct 24, 2025
cae6020
Add comment to clarify API usage
dakersnar Oct 27, 2025
4113e63
Address review feedback
dakersnar Nov 5, 2025
f02c6f8
Rework alignment deriving while gap filling
dakersnar Nov 5, 2025
8240ccb
Fix bug in alignment derive, update test to show improvement
dakersnar Nov 5, 2025
01dad11
Update tests to check for hex pragma
dakersnar Nov 5, 2025
6dc716d
Add more specific asserts, remove if condition
dakersnar Nov 5, 2025
7a05ee3
Update test to account for change in sub-byte element type legalizati…
dakersnar Nov 5, 2025
82b6fcd
Formatting
dakersnar Nov 5, 2025
12a7b5b
Merge remote-tracking branch 'github/main' into github/dkersnar/lsv-g…
dakersnar Nov 21, 2025
ccd5893
Fix formatting
dakersnar Nov 21, 2025
551f136
Add redundant element test with gap filling
dakersnar Nov 21, 2025
98d6f23
Merge branch 'main' into github/dkersnar/lsv-gap-fill
dakersnar Nov 26, 2025
b9fa06d
Update test to be auto generated
dakersnar Nov 26, 2025
97e8a10
Refactor to prevent extending chain too soon
dakersnar Dec 1, 2025
eb6df17
Fix alignment on test, update checks to generate masked load
dakersnar Dec 3, 2025
80b68fd
Merge branch 'main' into github/dkersnar/lsv-gap-fill
dakersnar Dec 3, 2025
37cef5b
Fix overeager alignment upgrading when vectorizer tries to upgrade al…
dakersnar Dec 3, 2025
af095c2
Fix test check
dakersnar Dec 3, 2025
0f91e88
Add new test, adjust comments
dakersnar Dec 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
476 changes: 407 additions & 69 deletions llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Large diffs are not rendered by default.

41 changes: 22 additions & 19 deletions llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -45,29 +45,32 @@ define half @fh(ptr %p) {
; ENABLED-LABEL: fh(
; ENABLED: {
; ENABLED-NEXT: .reg .b16 %rs<10>;
; ENABLED-NEXT: .reg .b32 %r<13>;
; ENABLED-NEXT: .reg .b32 %r<17>;
; ENABLED-NEXT: .reg .b64 %rd<2>;
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0];
; ENABLED-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; ENABLED-NEXT: ld.b16 %rs5, [%rd1+8];
; ENABLED-NEXT: cvt.f32.f16 %r1, %rs2;
; ENABLED-NEXT: cvt.f32.f16 %r2, %rs1;
; ENABLED-NEXT: add.rn.f32 %r3, %r2, %r1;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r3;
; ENABLED-NEXT: cvt.f32.f16 %r4, %rs4;
; ENABLED-NEXT: cvt.f32.f16 %r5, %rs3;
; ENABLED-NEXT: add.rn.f32 %r6, %r5, %r4;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r6;
; ENABLED-NEXT: cvt.f32.f16 %r7, %rs7;
; ENABLED-NEXT: cvt.f32.f16 %r8, %rs6;
; ENABLED-NEXT: add.rn.f32 %r9, %r8, %r7;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r9;
; ENABLED-NEXT: cvt.f32.f16 %r10, %rs8;
; ENABLED-NEXT: cvt.f32.f16 %r11, %rs5;
; ENABLED-NEXT: add.rn.f32 %r12, %r10, %r11;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r12;
; ENABLED-NEXT: .pragma "used_bytes_mask 0x3ff";
; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; ENABLED-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
; ENABLED-NEXT: mov.b32 {%rs2, %rs3}, %r2;
; ENABLED-NEXT: mov.b32 {%rs4, %rs5}, %r1;
; ENABLED-NEXT: cvt.f32.f16 %r5, %rs5;
; ENABLED-NEXT: cvt.f32.f16 %r6, %rs4;
; ENABLED-NEXT: add.rn.f32 %r7, %r6, %r5;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r7;
; ENABLED-NEXT: cvt.f32.f16 %r8, %rs3;
; ENABLED-NEXT: cvt.f32.f16 %r9, %rs2;
; ENABLED-NEXT: add.rn.f32 %r10, %r9, %r8;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r10;
; ENABLED-NEXT: cvt.f32.f16 %r11, %rs7;
; ENABLED-NEXT: cvt.f32.f16 %r12, %rs6;
; ENABLED-NEXT: add.rn.f32 %r13, %r12, %r11;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r13;
; ENABLED-NEXT: cvt.f32.f16 %r14, %rs8;
; ENABLED-NEXT: cvt.f32.f16 %r15, %rs1;
; ENABLED-NEXT: add.rn.f32 %r16, %r14, %r15;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r16;
; ENABLED-NEXT: st.param.b16 [func_retval0], %rs9;
; ENABLED-NEXT: ret;
;
Expand Down
84 changes: 84 additions & 0 deletions llvm/test/CodeGen/NVPTX/masked-load-3xhalf.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s
; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}

; This tests the lowering of a case from LoadStoreVectorizer/NVPTX/4x2xhalf.ll
; in which two <3 x half> loads are chained together and extended to <8 x half>.
define void @halfx3_extend_chain(ptr align 16 captures(none) %rd0) {
; CHECK-LABEL: halfx3_extend_chain(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<7>;
; CHECK-NEXT: .reg .b32 %r<12>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [halfx3_extend_chain_param_0];
; CHECK-NEXT: .pragma "used_bytes_mask 0xfff";
; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-NEXT: mov.b32 {_, %rs3}, %r2;
; CHECK-NEXT: mov.b32 %r5, {%rs3, %rs1};
; CHECK-NEXT: mov.b32 %r6, {%rs2, %rs4};
; CHECK-NEXT: mov.b32 %r7, 0;
; CHECK-NEXT: max.f16x2 %r8, %r2, %r7;
; CHECK-NEXT: max.f16x2 %r9, %r1, %r7;
; CHECK-NEXT: st.b32 [%rd1], %r9;
; CHECK-NEXT: mov.b32 {%rs5, _}, %r8;
; CHECK-NEXT: st.b16 [%rd1+4], %rs5;
; CHECK-NEXT: max.f16x2 %r10, %r6, %r7;
; CHECK-NEXT: max.f16x2 %r11, %r5, %r7;
; CHECK-NEXT: st.b32 [%rd1+6], %r11;
; CHECK-NEXT: mov.b32 {%rs6, _}, %r10;
; CHECK-NEXT: st.b16 [%rd1+10], %rs6;
; CHECK-NEXT: ret;
;
; Expected lowering, per the assertions above: the two <3 x half> loads below
; are served by a single ld.v4.b32 that is preceded by a used_bytes_mask pragma
; of 0xfff, while the stores stay split into st.b32/st.b16 pairs at offsets
; 0, 4, 6 and 10.
; NOTE(review): 0xfff presumably marks the low 12 bytes (6 halves) of the
; 16-byte vector load as used -- confirm against the pragma's definition.
;
; First vector: read from the base pointer with 16-byte alignment. Each lane is
; kept only if it compares ordered-greater-than zero; otherwise it becomes zero.
; The result is written back in place.
%load1 = load <3 x half>, ptr %rd0, align 16
%p1 = fcmp ogt <3 x half> %load1, zeroinitializer
%s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer
store <3 x half> %s1, ptr %rd0, align 16
; Second vector: starts 3 halves past the base, i.e. immediately after the
; first vector, and is only 4-byte aligned. Same compare/select/store sequence.
%in2 = getelementptr half, ptr %rd0, i64 3
%load2 = load <3 x half>, ptr %in2, align 4
%p2 = fcmp ogt <3 x half> %load2, zeroinitializer
%s2 = select <3 x i1> %p2, <3 x half> %load2, <3 x half> zeroinitializer
store <3 x half> %s2, ptr %in2, align 4
ret void
}

; Reducing the pointer alignment from 16 to 4 bytes disables the vectorization.
define void @halfx3_no_align(ptr align 4 captures(none) %rd0) {
; CHECK-LABEL: halfx3_no_align(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<7>;
; CHECK-NEXT: .reg .b32 %r<10>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [halfx3_no_align_param_0];
; CHECK-NEXT: ld.b16 %rs1, [%rd1+4];
; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NEXT: ld.b32 %r2, [%rd1];
; CHECK-NEXT: mov.b32 %r3, 0;
; CHECK-NEXT: max.f16x2 %r4, %r1, %r3;
; CHECK-NEXT: max.f16x2 %r5, %r2, %r3;
; CHECK-NEXT: st.b32 [%rd1], %r5;
; CHECK-NEXT: mov.b32 {%rs3, _}, %r4;
; CHECK-NEXT: st.b16 [%rd1+4], %rs3;
; CHECK-NEXT: ld.b16 %rs4, [%rd1+10];
; CHECK-NEXT: mov.b32 %r6, {%rs4, %rs5};
; CHECK-NEXT: ld.b32 %r7, [%rd1+6];
; CHECK-NEXT: max.f16x2 %r8, %r6, %r3;
; CHECK-NEXT: max.f16x2 %r9, %r7, %r3;
; CHECK-NEXT: st.b32 [%rd1+6], %r9;
; CHECK-NEXT: mov.b32 {%rs6, _}, %r8;
; CHECK-NEXT: st.b16 [%rd1+10], %rs6;
; CHECK-NEXT: ret;
;
; Expected lowering, per the assertions above: no used_bytes_mask pragma and no
; ld.v4.b32. Each <3 x half> is read with its own ld.b32 + ld.b16 pair (offsets
; 0/4 for the first vector, 6/10 for the second) and written back with a
; matching st.b32 + st.b16 pair.
;
; First vector: read from the base pointer, which is only 4-byte aligned here.
; Each lane is kept only if it compares ordered-greater-than zero; otherwise it
; becomes zero. The result is written back in place.
%load1 = load <3 x half>, ptr %rd0, align 4
%p1 = fcmp ogt <3 x half> %load1, zeroinitializer
%s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer
store <3 x half> %s1, ptr %rd0, align 4
; Second vector: starts 3 halves past the base, i.e. immediately after the
; first vector. Same compare/select/store sequence.
%in2 = getelementptr half, ptr %rd0, i64 3
%load2 = load <3 x half>, ptr %in2, align 4
%p2 = fcmp ogt <3 x half> %load2, zeroinitializer
%s2 = select <3 x i1> %p2, <3 x half> %load2, <3 x half> zeroinitializer
store <3 x half> %s2, ptr %in2, align 4
ret void
}
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,8 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by
; CHECK: .func (.param .align 16 .b8 func_retval0[12])
; CHECK-LABEL: callee_St4x3(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12]
; CHECK: ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0];
; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8];
; CHECK: .pragma "used_bytes_mask 0xfff";
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], %{{.*}}}, [callee_St4x3_param_0];
; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]};
; CHECK-DAG: st.param.b32 [func_retval0+8], [[R3]];
; CHECK-NEXT: ret;
Expand Down Expand Up @@ -394,8 +394,8 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by
; CHECK-LABEL: callee_St4x7(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28]
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0];
; CHECK: ld.param.v2.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16];
; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24];
; CHECK: .pragma "used_bytes_mask 0xfff";
; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], %{{.*}}}, [callee_St4x7_param_0+16];
; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
; CHECK-DAG: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
; CHECK-DAG: st.param.b32 [func_retval0+24], [[R7]];
Expand Down
105 changes: 88 additions & 17 deletions llvm/test/Transforms/LoadStoreVectorizer/NVPTX/4x2xhalf.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s

define void @ldg_f16(ptr nocapture align 16 %rd0) {
; CHECK-LABEL: define void @ldg_f16(
; CHECK-SAME: ptr align 16 captures(none) [[RD0:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[RD0]], align 16
; CHECK-NEXT: [[LOAD11:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: [[LOAD22:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: [[LOAD33:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> <i32 4, i32 5>
; CHECK-NEXT: [[LOAD44:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> <i32 6, i32 7>
; CHECK-NEXT: [[P1:%.*]] = fcmp ogt <2 x half> [[LOAD11]], zeroinitializer
; CHECK-NEXT: [[S1:%.*]] = select <2 x i1> [[P1]], <2 x half> [[LOAD11]], <2 x half> zeroinitializer
; CHECK-NEXT: [[P2:%.*]] = fcmp ogt <2 x half> [[LOAD22]], zeroinitializer
; CHECK-NEXT: [[S2:%.*]] = select <2 x i1> [[P2]], <2 x half> [[LOAD22]], <2 x half> zeroinitializer
; CHECK-NEXT: [[P3:%.*]] = fcmp ogt <2 x half> [[LOAD33]], zeroinitializer
; CHECK-NEXT: [[S3:%.*]] = select <2 x i1> [[P3]], <2 x half> [[LOAD33]], <2 x half> zeroinitializer
; CHECK-NEXT: [[P4:%.*]] = fcmp ogt <2 x half> [[LOAD44]], zeroinitializer
; CHECK-NEXT: [[S4:%.*]] = select <2 x i1> [[P4]], <2 x half> [[LOAD44]], <2 x half> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x half> [[S1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> poison, half [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[S1]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x half> [[TMP3]], half [[TMP4]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x half> [[S2]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x half> [[TMP5]], half [[TMP6]], i32 2
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x half> [[S2]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x half> [[TMP7]], half [[TMP8]], i32 3
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x half> [[S3]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x half> [[TMP9]], half [[TMP10]], i32 4
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x half> [[S3]], i32 1
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x half> [[TMP11]], half [[TMP12]], i32 5
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x half> [[S4]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x half> [[TMP13]], half [[TMP14]], i32 6
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x half> [[S4]], i32 1
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x half> [[TMP15]], half [[TMP16]], i32 7
; CHECK-NEXT: store <8 x half> [[TMP17]], ptr [[RD0]], align 16
; CHECK-NEXT: ret void
;
%load1 = load <2 x half>, ptr %rd0, align 16
%p1 = fcmp ogt <2 x half> %load1, zeroinitializer
%s1 = select <2 x i1> %p1, <2 x half> %load1, <2 x half> zeroinitializer
Expand All @@ -22,20 +57,39 @@ define void @ldg_f16(ptr nocapture align 16 %rd0) {
store <2 x half> %s4, ptr %in4, align 4
ret void

; CHECK-LABEL: @ldg_f16
; CHECK: %[[LD:.*]] = load <8 x half>, ptr
; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 0, i32 1>
; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 2, i32 3>
; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 4, i32 5>
; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 6, i32 7>
; CHECK: store <8 x half>
}

define void @no_nonpow2_vector(ptr nocapture align 16 %rd0) {
%load1 = load <3 x half>, ptr %rd0, align 4
; CHECK-LABEL: define void @no_nonpow2_vector(
; CHECK-SAME: ptr align 16 captures(none) [[RD0:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0(ptr align 16 [[RD0]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x half> poison)
; CHECK-NEXT: [[LOAD13:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <3 x i32> <i32 0, i32 1, i32 2>
; CHECK-NEXT: [[LOAD24:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <3 x i32> <i32 3, i32 4, i32 5>
; CHECK-NEXT: [[EXTEND5:%.*]] = extractelement <8 x half> [[TMP1]], i32 6
; CHECK-NEXT: [[EXTEND26:%.*]] = extractelement <8 x half> [[TMP1]], i32 7
; CHECK-NEXT: [[P1:%.*]] = fcmp ogt <3 x half> [[LOAD13]], zeroinitializer
; CHECK-NEXT: [[S1:%.*]] = select <3 x i1> [[P1]], <3 x half> [[LOAD13]], <3 x half> zeroinitializer
; CHECK-NEXT: store <3 x half> [[S1]], ptr [[RD0]], align 16
; CHECK-NEXT: [[IN2:%.*]] = getelementptr half, ptr [[RD0]], i64 3
; CHECK-NEXT: [[P2:%.*]] = fcmp ogt <3 x half> [[LOAD24]], zeroinitializer
; CHECK-NEXT: [[S2:%.*]] = select <3 x i1> [[P2]], <3 x half> [[LOAD24]], <3 x half> zeroinitializer
; CHECK-NEXT: store <3 x half> [[S2]], ptr [[IN2]], align 4
; CHECK-NEXT: [[IN3:%.*]] = getelementptr half, ptr [[RD0]], i64 6
; CHECK-NEXT: [[LOAD3:%.*]] = load <3 x half>, ptr [[IN3]], align 4
; CHECK-NEXT: [[P3:%.*]] = fcmp ogt <3 x half> [[LOAD3]], zeroinitializer
; CHECK-NEXT: [[S3:%.*]] = select <3 x i1> [[P3]], <3 x half> [[LOAD3]], <3 x half> zeroinitializer
; CHECK-NEXT: store <3 x half> [[S3]], ptr [[IN3]], align 4
; CHECK-NEXT: [[IN4:%.*]] = getelementptr half, ptr [[RD0]], i64 9
; CHECK-NEXT: [[LOAD4:%.*]] = load <3 x half>, ptr [[IN4]], align 4
; CHECK-NEXT: [[P4:%.*]] = fcmp ogt <3 x half> [[LOAD4]], zeroinitializer
; CHECK-NEXT: [[S4:%.*]] = select <3 x i1> [[P4]], <3 x half> [[LOAD4]], <3 x half> zeroinitializer
; CHECK-NEXT: store <3 x half> [[S4]], ptr [[IN4]], align 4
; CHECK-NEXT: ret void
;
%load1 = load <3 x half>, ptr %rd0, align 16
%p1 = fcmp ogt <3 x half> %load1, zeroinitializer
%s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer
store <3 x half> %s1, ptr %rd0, align 4
store <3 x half> %s1, ptr %rd0, align 16
%in2 = getelementptr half, ptr %rd0, i64 3
%load2 = load <3 x half>, ptr %in2, align 4
%p2 = fcmp ogt <3 x half> %load2, zeroinitializer
Expand All @@ -52,16 +106,36 @@ define void @no_nonpow2_vector(ptr nocapture align 16 %rd0) {
%s4 = select <3 x i1> %p4, <3 x half> %load4, <3 x half> zeroinitializer
store <3 x half> %s4, ptr %in4, align 4
ret void

; CHECK-LABEL: @no_nonpow2_vector
; CHECK-NOT: shufflevector
}

define void @no_pointer_vector(ptr nocapture align 16 %rd0) {
%load1 = load <2 x ptr>, ptr %rd0, align 4
; CHECK-LABEL: define void @no_pointer_vector(
; CHECK-SAME: ptr align 16 captures(none) [[RD0:%.*]]) {
; CHECK-NEXT: [[LOAD1:%.*]] = load <2 x ptr>, ptr [[RD0]], align 16
; CHECK-NEXT: [[P1:%.*]] = icmp ne <2 x ptr> [[LOAD1]], zeroinitializer
; CHECK-NEXT: [[S1:%.*]] = select <2 x i1> [[P1]], <2 x ptr> [[LOAD1]], <2 x ptr> zeroinitializer
; CHECK-NEXT: store <2 x ptr> [[S1]], ptr [[RD0]], align 16
; CHECK-NEXT: [[IN2:%.*]] = getelementptr ptr, ptr [[RD0]], i64 2
; CHECK-NEXT: [[LOAD2:%.*]] = load <2 x ptr>, ptr [[IN2]], align 4
; CHECK-NEXT: [[P2:%.*]] = icmp ne <2 x ptr> [[LOAD2]], zeroinitializer
; CHECK-NEXT: [[S2:%.*]] = select <2 x i1> [[P2]], <2 x ptr> [[LOAD2]], <2 x ptr> zeroinitializer
; CHECK-NEXT: store <2 x ptr> [[S2]], ptr [[IN2]], align 4
; CHECK-NEXT: [[IN3:%.*]] = getelementptr ptr, ptr [[RD0]], i64 4
; CHECK-NEXT: [[LOAD3:%.*]] = load <2 x ptr>, ptr [[IN3]], align 4
; CHECK-NEXT: [[P3:%.*]] = icmp ne <2 x ptr> [[LOAD3]], zeroinitializer
; CHECK-NEXT: [[S3:%.*]] = select <2 x i1> [[P3]], <2 x ptr> [[LOAD3]], <2 x ptr> zeroinitializer
; CHECK-NEXT: store <2 x ptr> [[S3]], ptr [[IN3]], align 4
; CHECK-NEXT: [[IN4:%.*]] = getelementptr ptr, ptr [[RD0]], i64 6
; CHECK-NEXT: [[LOAD4:%.*]] = load <2 x ptr>, ptr [[IN4]], align 4
; CHECK-NEXT: [[P4:%.*]] = icmp ne <2 x ptr> [[LOAD4]], zeroinitializer
; CHECK-NEXT: [[S4:%.*]] = select <2 x i1> [[P4]], <2 x ptr> [[LOAD4]], <2 x ptr> zeroinitializer
; CHECK-NEXT: store <2 x ptr> [[S4]], ptr [[IN4]], align 4
; CHECK-NEXT: ret void
;
%load1 = load <2 x ptr>, ptr %rd0, align 16
%p1 = icmp ne <2 x ptr> %load1, zeroinitializer
%s1 = select <2 x i1> %p1, <2 x ptr> %load1, <2 x ptr> zeroinitializer
store <2 x ptr> %s1, ptr %rd0, align 4
store <2 x ptr> %s1, ptr %rd0, align 16
%in2 = getelementptr ptr, ptr %rd0, i64 2
%load2 = load <2 x ptr>, ptr %in2, align 4
%p2 = icmp ne <2 x ptr> %load2, zeroinitializer
Expand All @@ -78,7 +152,4 @@ define void @no_pointer_vector(ptr nocapture align 16 %rd0) {
%s4 = select <2 x i1> %p4, <2 x ptr> %load4, <2 x ptr> zeroinitializer
store <2 x ptr> %s4, ptr %in4, align 4
ret void

; CHECK-LABEL: @no_pointer_vector
; CHECK-NOT: shufflevector
}
Loading
Loading