Skip to content

Commit 5db774a

Browse files
committed
[LV] Add additional test for narrowing to single scalars.
Add extra test coverage for narrowing stores to single scalars, with the store address being uniform-per-part, not uniform-across-all-parts. Test for #162498.
1 parent e5827e7 commit 5db774a

File tree

1 file changed

+134
-48
lines changed

1 file changed

+134
-48
lines changed
Lines changed: 134 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,59 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2-
; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
2+
; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck --check-prefix=VF4IC1 %s
3+
; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck --check-prefix=VF2IC2 %s
34

45
define void @narrow_select_to_single_scalar(i1 %invar.cond, ptr noalias %A, ptr noalias %B, ptr noalias %C) {
5-
; CHECK-LABEL: define void @narrow_select_to_single_scalar(
6-
; CHECK-SAME: i1 [[INVAR_COND:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
7-
; CHECK-NEXT: [[ENTRY:.*:]]
8-
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
9-
; CHECK: [[VECTOR_PH]]:
10-
; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[INVAR_COND]], i16 0, i16 1
11-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[C]], i16 [[TMP0]]
12-
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
13-
; CHECK: [[VECTOR_BODY]]:
14-
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
15-
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
16-
; CHECK-NEXT: [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 0
17-
; CHECK-NEXT: [[TMP3:%.*]] = add i16 [[OFFSET_IDX]], 1
18-
; CHECK-NEXT: [[TMP4:%.*]] = add i16 [[OFFSET_IDX]], 2
19-
; CHECK-NEXT: [[TMP5:%.*]] = add i16 [[OFFSET_IDX]], 3
20-
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i16 [[TMP5]]
21-
; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[TMP6]], align 1
22-
; CHECK-NEXT: store i16 [[TMP7]], ptr [[B]], align 1
23-
; CHECK-NEXT: store i16 0, ptr [[TMP1]], align 1
24-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
25-
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
26-
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
27-
; CHECK: [[MIDDLE_BLOCK]]:
28-
; CHECK-NEXT: br label %[[SCALAR_PH:.*]]
29-
; CHECK: [[SCALAR_PH]]:
30-
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
31-
; CHECK: [[LOOP_HEADER]]:
32-
; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 1024, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_HEADER]] ]
33-
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i16 [[IV]]
34-
; CHECK-NEXT: [[L_0:%.*]] = load i16, ptr [[GEP_A]], align 1
35-
; CHECK-NEXT: store i16 [[L_0]], ptr [[B]], align 1
36-
; CHECK-NEXT: [[INVAR_SEL:%.*]] = select i1 [[INVAR_COND]], i16 0, i16 1
37-
; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr i16, ptr [[C]], i16 [[INVAR_SEL]]
38-
; CHECK-NEXT: store i16 0, ptr [[GEP_C]], align 1
39-
; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
40-
; CHECK-NEXT: [[EC:%.*]] = icmp ne i16 [[IV]], 1024
41-
; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT:.*]], !llvm.loop [[LOOP3:![0-9]+]]
42-
; CHECK: [[EXIT]]:
43-
; CHECK-NEXT: ret void
6+
; VF4IC1-LABEL: define void @narrow_select_to_single_scalar(
7+
; VF4IC1-SAME: i1 [[INVAR_COND:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
8+
; VF4IC1-NEXT: [[ENTRY:.*:]]
9+
; VF4IC1-NEXT: br label %[[VECTOR_PH:.*]]
10+
; VF4IC1: [[VECTOR_PH]]:
11+
; VF4IC1-NEXT: [[TMP0:%.*]] = select i1 [[INVAR_COND]], i16 0, i16 1
12+
; VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[C]], i16 [[TMP0]]
13+
; VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
14+
; VF4IC1: [[VECTOR_BODY]]:
15+
; VF4IC1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
16+
; VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
17+
; VF4IC1-NEXT: [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 0
18+
; VF4IC1-NEXT: [[TMP3:%.*]] = add i16 [[OFFSET_IDX]], 1
19+
; VF4IC1-NEXT: [[TMP4:%.*]] = add i16 [[OFFSET_IDX]], 2
20+
; VF4IC1-NEXT: [[TMP5:%.*]] = add i16 [[OFFSET_IDX]], 3
21+
; VF4IC1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i16 [[TMP5]]
22+
; VF4IC1-NEXT: [[TMP7:%.*]] = load i16, ptr [[TMP6]], align 1
23+
; VF4IC1-NEXT: store i16 [[TMP7]], ptr [[B]], align 1
24+
; VF4IC1-NEXT: store i16 0, ptr [[TMP1]], align 1
25+
; VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
26+
; VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
27+
; VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
28+
; VF4IC1: [[MIDDLE_BLOCK]]:
29+
; VF4IC1-NEXT: br label %[[EXIT:.*]]
30+
; VF4IC1: [[EXIT]]:
31+
; VF4IC1-NEXT: ret void
32+
;
33+
; VF2IC2-LABEL: define void @narrow_select_to_single_scalar(
34+
; VF2IC2-SAME: i1 [[INVAR_COND:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
35+
; VF2IC2-NEXT: [[ENTRY:.*:]]
36+
; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]]
37+
; VF2IC2: [[VECTOR_PH]]:
38+
; VF2IC2-NEXT: [[TMP0:%.*]] = select i1 [[INVAR_COND]], i16 0, i16 1
39+
; VF2IC2-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[C]], i16 [[TMP0]]
40+
; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
41+
; VF2IC2: [[VECTOR_BODY]]:
42+
; VF2IC2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
43+
; VF2IC2-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
44+
; VF2IC2-NEXT: [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 2
45+
; VF2IC2-NEXT: [[TMP3:%.*]] = add i16 [[OFFSET_IDX]], 3
46+
; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i16 [[TMP3]]
47+
; VF2IC2-NEXT: [[TMP5:%.*]] = load i16, ptr [[TMP4]], align 1
48+
; VF2IC2-NEXT: store i16 [[TMP5]], ptr [[B]], align 1
49+
; VF2IC2-NEXT: store i16 0, ptr [[TMP1]], align 1
50+
; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
51+
; VF2IC2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
52+
; VF2IC2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
53+
; VF2IC2: [[MIDDLE_BLOCK]]:
54+
; VF2IC2-NEXT: br label %[[EXIT:.*]]
55+
; VF2IC2: [[EXIT]]:
56+
; VF2IC2-NEXT: ret void
4457
;
4558
entry:
4659
br label %loop.header
@@ -54,15 +67,88 @@ loop.header:
5467
%gep.C = getelementptr i16, ptr %C, i16 %invar.sel
5568
store i16 0, ptr %gep.C, align 1
5669
%iv.next = add i16 %iv, 1
57-
%ec = icmp ne i16 %iv, 1024
70+
%ec = icmp ne i16 %iv.next, 1024
5871
br i1 %ec, label %loop.header, label %exit
5972

6073
exit:
6174
ret void
6275
}
63-
;.
64-
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
65-
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
66-
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
67-
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
68-
;.
76+
77+
; FIXME: Currently this is miscompiled when interleaving; all stores store the
78+
; last lane of the last part, instead of the last lane per part.
79+
; Test case for https://github.com/llvm/llvm-project/issues/162498.
80+
define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(ptr %dst) {
81+
; VF4IC1-LABEL: define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(
82+
; VF4IC1-SAME: ptr [[DST:%.*]]) {
83+
; VF4IC1-NEXT: [[ENTRY:.*:]]
84+
; VF4IC1-NEXT: br label %[[VECTOR_PH:.*]]
85+
; VF4IC1: [[VECTOR_PH]]:
86+
; VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]]
87+
; VF4IC1: [[VECTOR_BODY]]:
88+
; VF4IC1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
89+
; VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
90+
; VF4IC1-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
91+
; VF4IC1-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1
92+
; VF4IC1-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
93+
; VF4IC1-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3
94+
; VF4IC1-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[VEC_IND]], splat (i32 1)
95+
; VF4IC1-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
96+
; VF4IC1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP5]]
97+
; VF4IC1-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
98+
; VF4IC1-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP7]]
99+
; VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
100+
; VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP9]]
101+
; VF4IC1-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
102+
; VF4IC1-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP11]]
103+
; VF4IC1-NEXT: store i32 [[TMP0]], ptr [[TMP6]], align 4
104+
; VF4IC1-NEXT: store i32 [[TMP1]], ptr [[TMP8]], align 4
105+
; VF4IC1-NEXT: store i32 [[TMP2]], ptr [[TMP10]], align 4
106+
; VF4IC1-NEXT: store i32 [[TMP3]], ptr [[TMP12]], align 4
107+
; VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
108+
; VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
109+
; VF4IC1-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
110+
; VF4IC1-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
111+
; VF4IC1: [[MIDDLE_BLOCK]]:
112+
; VF4IC1-NEXT: br label %[[EXIT:.*]]
113+
; VF4IC1: [[EXIT]]:
114+
; VF4IC1-NEXT: ret void
115+
;
116+
; VF2IC2-LABEL: define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(
117+
; VF2IC2-SAME: ptr [[DST:%.*]]) {
118+
; VF2IC2-NEXT: [[ENTRY:.*:]]
119+
; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]]
120+
; VF2IC2: [[VECTOR_PH]]:
121+
; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
122+
; VF2IC2: [[VECTOR_BODY]]:
123+
; VF2IC2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
124+
; VF2IC2-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 2
125+
; VF2IC2-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 3
126+
; VF2IC2-NEXT: [[TMP2:%.*]] = lshr i32 [[INDEX]], 1
127+
; VF2IC2-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP0]], 1
128+
; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP2]]
129+
; VF2IC2-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP3]]
130+
; VF2IC2-NEXT: store i32 [[TMP1]], ptr [[TMP4]], align 4
131+
; VF2IC2-NEXT: store i32 [[TMP1]], ptr [[TMP5]], align 4
132+
; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
133+
; VF2IC2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
134+
; VF2IC2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
135+
; VF2IC2: [[MIDDLE_BLOCK]]:
136+
; VF2IC2-NEXT: br label %[[EXIT:.*]]
137+
; VF2IC2: [[EXIT]]:
138+
; VF2IC2-NEXT: ret void
139+
;
140+
entry:
141+
br label %loop
142+
; Scalar loop under test: for iv in [0, 100), store iv to dst[iv >> 1].
; Iterations 2k and 2k+1 address the same element, so the store address is
; shared by each pair of consecutive iterations but changes from pair to pair.
143+
loop:
144+
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
145+
%iv.shift = lshr i32 %iv, 1
146+
%gep.dst = getelementptr i32, ptr %dst, i32 %iv.shift
; The stored value is the induction variable itself, so it differs on every
; iteration even when the address does not.
147+
store i32 %iv, ptr %gep.dst, align 4
148+
%iv.next = add i32 %iv, 1
; Exit once the incremented IV reaches 100, i.e. after 100 iterations.
149+
%ec = icmp eq i32 %iv.next, 100
150+
br i1 %ec, label %exit, label %loop
151+
152+
exit:
153+
ret void
154+
}

0 commit comments

Comments
 (0)