1+ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
12; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s
23
; ldg_f16: per the CHECK lines below, the expected output is a single
; <8 x half> load from %rd0 (align 16) that is split into four <2 x half>
; lanes with shufflevector. Each lane goes through fcmp ogt / select against
; zeroinitializer, and the four results are reassembled element by element
; (extractelement / insertelement) into one <8 x half> value that is stored
; back to %rd0 with align 16.
34define void @ldg_f16 (ptr nocapture align 16 %rd0 ) {
5+ ; CHECK-LABEL: define void @ldg_f16(
6+ ; CHECK-SAME: ptr align 16 captures(none) [[RD0:%.*]]) {
7+ ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[RD0]], align 16
8+ ; CHECK-NEXT: [[LOAD11:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> <i32 0, i32 1>
9+ ; CHECK-NEXT: [[LOAD22:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> <i32 2, i32 3>
10+ ; CHECK-NEXT: [[LOAD33:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> <i32 4, i32 5>
11+ ; CHECK-NEXT: [[LOAD44:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <2 x i32> <i32 6, i32 7>
12+ ; CHECK-NEXT: [[P1:%.*]] = fcmp ogt <2 x half> [[LOAD11]], zeroinitializer
13+ ; CHECK-NEXT: [[S1:%.*]] = select <2 x i1> [[P1]], <2 x half> [[LOAD11]], <2 x half> zeroinitializer
14+ ; CHECK-NEXT: [[P2:%.*]] = fcmp ogt <2 x half> [[LOAD22]], zeroinitializer
15+ ; CHECK-NEXT: [[S2:%.*]] = select <2 x i1> [[P2]], <2 x half> [[LOAD22]], <2 x half> zeroinitializer
16+ ; CHECK-NEXT: [[P3:%.*]] = fcmp ogt <2 x half> [[LOAD33]], zeroinitializer
17+ ; CHECK-NEXT: [[S3:%.*]] = select <2 x i1> [[P3]], <2 x half> [[LOAD33]], <2 x half> zeroinitializer
18+ ; CHECK-NEXT: [[P4:%.*]] = fcmp ogt <2 x half> [[LOAD44]], zeroinitializer
19+ ; CHECK-NEXT: [[S4:%.*]] = select <2 x i1> [[P4]], <2 x half> [[LOAD44]], <2 x half> zeroinitializer
20+ ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x half> [[S1]], i32 0
21+ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x half> poison, half [[TMP2]], i32 0
22+ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[S1]], i32 1
23+ ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x half> [[TMP3]], half [[TMP4]], i32 1
24+ ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x half> [[S2]], i32 0
25+ ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x half> [[TMP5]], half [[TMP6]], i32 2
26+ ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x half> [[S2]], i32 1
27+ ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x half> [[TMP7]], half [[TMP8]], i32 3
28+ ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x half> [[S3]], i32 0
29+ ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x half> [[TMP9]], half [[TMP10]], i32 4
30+ ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x half> [[S3]], i32 1
31+ ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x half> [[TMP11]], half [[TMP12]], i32 5
32+ ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x half> [[S4]], i32 0
33+ ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x half> [[TMP13]], half [[TMP14]], i32 6
34+ ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x half> [[S4]], i32 1
35+ ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x half> [[TMP15]], half [[TMP16]], i32 7
36+ ; CHECK-NEXT: store <8 x half> [[TMP17]], ptr [[RD0]], align 16
37+ ; CHECK-NEXT: ret void
38+ ;
439 %load1 = load <2 x half >, ptr %rd0 , align 16
540 %p1 = fcmp ogt <2 x half > %load1 , zeroinitializer
641 %s1 = select <2 x i1 > %p1 , <2 x half > %load1 , <2 x half > zeroinitializer
@@ -22,20 +57,39 @@ define void @ldg_f16(ptr nocapture align 16 %rd0) {
2257 store <2 x half > %s4 , ptr %in4 , align 4
2358 ret void
2459
25- ; CHECK-LABEL: @ldg_f16
26- ; CHECK: %[[LD:.*]] = load <8 x half>, ptr
27- ; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 0, i32 1>
28- ; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 2, i32 3>
29- ; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 4, i32 5>
30- ; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 6, i32 7>
31- ; CHECK: store <8 x half>
3260}
3361
; no_nonpow2_vector: <3 x half> accesses at half offsets 0, 3, 6 and 9 from
; %rd0. Per the CHECK lines below, the first two loads are expected to become
; one masked <8 x half> load (lanes 0-5 enabled, lanes 6-7 masked off) that is
; split with <3 x i32> shufflevectors, while the loads at offsets 6 and 9 and
; all four stores remain separate <3 x half> operations.
; NOTE(review): the function name and the older "CHECK-NOT: shufflevector"
; line suggest that no vectorization was intended here, yet the generated
; checks expect shufflevector; confirm this is the intended behavior.
3462define void @no_nonpow2_vector (ptr nocapture align 16 %rd0 ) {
35- %load1 = load <3 x half >, ptr %rd0 , align 4
63+ ; CHECK-LABEL: define void @no_nonpow2_vector(
64+ ; CHECK-SAME: ptr align 16 captures(none) [[RD0:%.*]]) {
65+ ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0(ptr align 16 [[RD0]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x half> poison)
66+ ; CHECK-NEXT: [[LOAD13:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <3 x i32> <i32 0, i32 1, i32 2>
67+ ; CHECK-NEXT: [[LOAD24:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <3 x i32> <i32 3, i32 4, i32 5>
68+ ; CHECK-NEXT: [[EXTEND5:%.*]] = extractelement <8 x half> [[TMP1]], i32 6
69+ ; CHECK-NEXT: [[EXTEND26:%.*]] = extractelement <8 x half> [[TMP1]], i32 7
70+ ; CHECK-NEXT: [[P1:%.*]] = fcmp ogt <3 x half> [[LOAD13]], zeroinitializer
71+ ; CHECK-NEXT: [[S1:%.*]] = select <3 x i1> [[P1]], <3 x half> [[LOAD13]], <3 x half> zeroinitializer
72+ ; CHECK-NEXT: store <3 x half> [[S1]], ptr [[RD0]], align 16
73+ ; CHECK-NEXT: [[IN2:%.*]] = getelementptr half, ptr [[RD0]], i64 3
74+ ; CHECK-NEXT: [[P2:%.*]] = fcmp ogt <3 x half> [[LOAD24]], zeroinitializer
75+ ; CHECK-NEXT: [[S2:%.*]] = select <3 x i1> [[P2]], <3 x half> [[LOAD24]], <3 x half> zeroinitializer
76+ ; CHECK-NEXT: store <3 x half> [[S2]], ptr [[IN2]], align 4
77+ ; CHECK-NEXT: [[IN3:%.*]] = getelementptr half, ptr [[RD0]], i64 6
78+ ; CHECK-NEXT: [[LOAD3:%.*]] = load <3 x half>, ptr [[IN3]], align 4
79+ ; CHECK-NEXT: [[P3:%.*]] = fcmp ogt <3 x half> [[LOAD3]], zeroinitializer
80+ ; CHECK-NEXT: [[S3:%.*]] = select <3 x i1> [[P3]], <3 x half> [[LOAD3]], <3 x half> zeroinitializer
81+ ; CHECK-NEXT: store <3 x half> [[S3]], ptr [[IN3]], align 4
82+ ; CHECK-NEXT: [[IN4:%.*]] = getelementptr half, ptr [[RD0]], i64 9
83+ ; CHECK-NEXT: [[LOAD4:%.*]] = load <3 x half>, ptr [[IN4]], align 4
84+ ; CHECK-NEXT: [[P4:%.*]] = fcmp ogt <3 x half> [[LOAD4]], zeroinitializer
85+ ; CHECK-NEXT: [[S4:%.*]] = select <3 x i1> [[P4]], <3 x half> [[LOAD4]], <3 x half> zeroinitializer
86+ ; CHECK-NEXT: store <3 x half> [[S4]], ptr [[IN4]], align 4
87+ ; CHECK-NEXT: ret void
88+ ;
89+ %load1 = load <3 x half >, ptr %rd0 , align 16
3690 %p1 = fcmp ogt <3 x half > %load1 , zeroinitializer
3791 %s1 = select <3 x i1 > %p1 , <3 x half > %load1 , <3 x half > zeroinitializer
38- store <3 x half > %s1 , ptr %rd0 , align 4
92+ store <3 x half > %s1 , ptr %rd0 , align 16
3993 %in2 = getelementptr half , ptr %rd0 , i64 3
4094 %load2 = load <3 x half >, ptr %in2 , align 4
4195 %p2 = fcmp ogt <3 x half > %load2 , zeroinitializer
@@ -52,16 +106,36 @@ define void @no_nonpow2_vector(ptr nocapture align 16 %rd0) {
52106 %s4 = select <3 x i1 > %p4 , <3 x half > %load4 , <3 x half > zeroinitializer
53107 store <3 x half > %s4 , ptr %in4 , align 4
54108 ret void
55-
56- ; CHECK-LABEL: @no_nonpow2_vector
57- ; CHECK-NOT: shufflevector
58109}
59110
; no_pointer_vector: <2 x ptr> accesses at ptr offsets 0, 2, 4 and 6 from
; %rd0. Per the CHECK lines below, no access is widened: each load, its
; icmp ne / select against zeroinitializer, and its store are expected to
; remain separate <2 x ptr> operations (align 16 for the first load/store
; pair, align 4 for the others).
60111define void @no_pointer_vector (ptr nocapture align 16 %rd0 ) {
61- %load1 = load <2 x ptr >, ptr %rd0 , align 4
112+ ; CHECK-LABEL: define void @no_pointer_vector(
113+ ; CHECK-SAME: ptr align 16 captures(none) [[RD0:%.*]]) {
114+ ; CHECK-NEXT: [[LOAD1:%.*]] = load <2 x ptr>, ptr [[RD0]], align 16
115+ ; CHECK-NEXT: [[P1:%.*]] = icmp ne <2 x ptr> [[LOAD1]], zeroinitializer
116+ ; CHECK-NEXT: [[S1:%.*]] = select <2 x i1> [[P1]], <2 x ptr> [[LOAD1]], <2 x ptr> zeroinitializer
117+ ; CHECK-NEXT: store <2 x ptr> [[S1]], ptr [[RD0]], align 16
118+ ; CHECK-NEXT: [[IN2:%.*]] = getelementptr ptr, ptr [[RD0]], i64 2
119+ ; CHECK-NEXT: [[LOAD2:%.*]] = load <2 x ptr>, ptr [[IN2]], align 4
120+ ; CHECK-NEXT: [[P2:%.*]] = icmp ne <2 x ptr> [[LOAD2]], zeroinitializer
121+ ; CHECK-NEXT: [[S2:%.*]] = select <2 x i1> [[P2]], <2 x ptr> [[LOAD2]], <2 x ptr> zeroinitializer
122+ ; CHECK-NEXT: store <2 x ptr> [[S2]], ptr [[IN2]], align 4
123+ ; CHECK-NEXT: [[IN3:%.*]] = getelementptr ptr, ptr [[RD0]], i64 4
124+ ; CHECK-NEXT: [[LOAD3:%.*]] = load <2 x ptr>, ptr [[IN3]], align 4
125+ ; CHECK-NEXT: [[P3:%.*]] = icmp ne <2 x ptr> [[LOAD3]], zeroinitializer
126+ ; CHECK-NEXT: [[S3:%.*]] = select <2 x i1> [[P3]], <2 x ptr> [[LOAD3]], <2 x ptr> zeroinitializer
127+ ; CHECK-NEXT: store <2 x ptr> [[S3]], ptr [[IN3]], align 4
128+ ; CHECK-NEXT: [[IN4:%.*]] = getelementptr ptr, ptr [[RD0]], i64 6
129+ ; CHECK-NEXT: [[LOAD4:%.*]] = load <2 x ptr>, ptr [[IN4]], align 4
130+ ; CHECK-NEXT: [[P4:%.*]] = icmp ne <2 x ptr> [[LOAD4]], zeroinitializer
131+ ; CHECK-NEXT: [[S4:%.*]] = select <2 x i1> [[P4]], <2 x ptr> [[LOAD4]], <2 x ptr> zeroinitializer
132+ ; CHECK-NEXT: store <2 x ptr> [[S4]], ptr [[IN4]], align 4
133+ ; CHECK-NEXT: ret void
134+ ;
135+ %load1 = load <2 x ptr >, ptr %rd0 , align 16
62136 %p1 = icmp ne <2 x ptr > %load1 , zeroinitializer
63137 %s1 = select <2 x i1 > %p1 , <2 x ptr > %load1 , <2 x ptr > zeroinitializer
64- store <2 x ptr > %s1 , ptr %rd0 , align 4
138+ store <2 x ptr > %s1 , ptr %rd0 , align 16
65139 %in2 = getelementptr ptr , ptr %rd0 , i64 2
66140 %load2 = load <2 x ptr >, ptr %in2 , align 4
67141 %p2 = icmp ne <2 x ptr > %load2 , zeroinitializer
@@ -78,7 +152,4 @@ define void @no_pointer_vector(ptr nocapture align 16 %rd0) {
78152 %s4 = select <2 x i1 > %p4 , <2 x ptr > %load4 , <2 x ptr > zeroinitializer
79153 store <2 x ptr > %s4 , ptr %in4 , align 4
80154 ret void
81-
82- ; CHECK-LABEL: @no_pointer_vector
83- ; CHECK-NOT: shufflevector
84155}
0 commit comments