Commit 0f91e88

Add new test, adjust comments
1 parent af095c2 commit 0f91e88

2 files changed: +89 -3 lines changed

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Lines changed: 5 additions & 3 deletions
@@ -351,8 +351,10 @@ class Vectorizer {
   /// in the chain is the leader, and an instr touches distance 0 from itself.
   std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);

-  /// Is a load/store with this alignment allowed by TTI and at least as fast
-  /// as an unvectorized load/store.
+  /// Checks if a potential vector load/store with a given alignment is allowed
+  /// and fast. Aligned accesses are always allowed and fast, while misaligned
+  /// accesses depend on TTI checks to determine whether they can and should be
+  /// vectorized or kept as element-wise accesses.
   bool accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS, Align Alignment,
                               unsigned VecElemBits) const;

@@ -1909,7 +1911,7 @@ bool Vectorizer::accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS,
   if (Alignment.value() % SizeBytes == 0)
     return true;

-  // Element-wise access *might* be faster than misaligned vector accesses.
+  // Ask TTI whether misaligned accesses are faster as vector or element-wise.
   unsigned VectorizedSpeed = 0;
   bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
       F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
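
As context for the updated comments, here is a minimal self-contained C++ sketch of the decision accessIsAllowedAndFast documents: naturally aligned accesses pass immediately, and misaligned ones defer to TTI. TTIStub and accessIsAllowedAndFastSketch are hypothetical stand-ins (the real query is TargetTransformInfo::allowsMisalignedMemoryAccesses, which also takes an LLVMContext), and the final element-wise speed comparison is an assumption about the part of the function this hunk does not show.

#include <cstdint>

// Hypothetical stand-in for llvm::TargetTransformInfo; models only the
// query used in the hunk above.
struct TTIStub {
  bool allowsMisalignedMemoryAccesses(unsigned /*BitWidth*/, unsigned /*AS*/,
                                      uint64_t /*Alignment*/,
                                      unsigned *Speed) const {
    *Speed = 1; // Toy target: misaligned accesses allowed, uniform speed.
    return true;
  }
};

bool accessIsAllowedAndFastSketch(const TTIStub &TTI, unsigned SizeBytes,
                                  unsigned AS, uint64_t Alignment,
                                  unsigned VecElemBits) {
  // Naturally aligned vector accesses are always allowed and fast.
  if (Alignment % SizeBytes == 0)
    return true;

  // Misaligned: ask the target whether the vector access is legal at all,
  // and how fast it would be.
  unsigned VectorizedSpeed = 0;
  bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
      SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
  if (!AllowsMisaligned)
    return false;

  // Compare against keeping the chain as element-wise accesses (assumed
  // policy; the real comparison lives outside this hunk).
  unsigned ElementwiseSpeed = 0;
  TTI.allowsMisalignedMemoryAccesses(VecElemBits, AS, Alignment,
                                     &ElementwiseSpeed);
  return VectorizedSpeed >= ElementwiseSpeed;
}
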
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s
+; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}
+
+; This is testing the lowering behavior of this case from LoadStoreVectorizer/NVPTX/4x2xhalf.ll
+; where two 3xhalfs are chained together and extended to 8xhalf.
+define void @halfx3_extend_chain(ptr align 16 captures(none) %rd0) {
+; CHECK-LABEL: halfx3_extend_chain(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b32 %r<12>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [halfx3_extend_chain_param_0];
+; CHECK-NEXT:    .pragma "used_bytes_mask 0xfff";
+; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
+; CHECK-NEXT:    mov.b32 {_, %rs3}, %r2;
+; CHECK-NEXT:    mov.b32 %r5, {%rs3, %rs1};
+; CHECK-NEXT:    mov.b32 %r6, {%rs2, %rs4};
+; CHECK-NEXT:    mov.b32 %r7, 0;
+; CHECK-NEXT:    max.f16x2 %r8, %r2, %r7;
+; CHECK-NEXT:    max.f16x2 %r9, %r1, %r7;
+; CHECK-NEXT:    st.b32 [%rd1], %r9;
+; CHECK-NEXT:    mov.b32 {%rs5, _}, %r8;
+; CHECK-NEXT:    st.b16 [%rd1+4], %rs5;
+; CHECK-NEXT:    max.f16x2 %r10, %r6, %r7;
+; CHECK-NEXT:    max.f16x2 %r11, %r5, %r7;
+; CHECK-NEXT:    st.b32 [%rd1+6], %r11;
+; CHECK-NEXT:    mov.b32 {%rs6, _}, %r10;
+; CHECK-NEXT:    st.b16 [%rd1+10], %rs6;
+; CHECK-NEXT:    ret;
+  %load1 = load <3 x half>, ptr %rd0, align 16
+  %p1 = fcmp ogt <3 x half> %load1, zeroinitializer
+  %s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer
+  store <3 x half> %s1, ptr %rd0, align 16
+  %in2 = getelementptr half, ptr %rd0, i64 3
+  %load2 = load <3 x half>, ptr %in2, align 4
+  %p2 = fcmp ogt <3 x half> %load2, zeroinitializer
+  %s2 = select <3 x i1> %p2, <3 x half> %load2, <3 x half> zeroinitializer
+  store <3 x half> %s2, ptr %in2, align 4
+  ret void
+}
+
+; This disables the vectorization by reducing the alignment.
+define void @halfx3_no_align(ptr align 4 captures(none) %rd0) {
+; CHECK-LABEL: halfx3_no_align(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b32 %r<10>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [halfx3_no_align_param_0];
+; CHECK-NEXT:    ld.b16 %rs1, [%rd1+4];
+; CHECK-NEXT:    mov.b32 %r1, {%rs1, %rs2};
+; CHECK-NEXT:    ld.b32 %r2, [%rd1];
+; CHECK-NEXT:    mov.b32 %r3, 0;
+; CHECK-NEXT:    max.f16x2 %r4, %r1, %r3;
+; CHECK-NEXT:    max.f16x2 %r5, %r2, %r3;
+; CHECK-NEXT:    st.b32 [%rd1], %r5;
+; CHECK-NEXT:    mov.b32 {%rs3, _}, %r4;
+; CHECK-NEXT:    st.b16 [%rd1+4], %rs3;
+; CHECK-NEXT:    ld.b16 %rs4, [%rd1+10];
+; CHECK-NEXT:    mov.b32 %r6, {%rs4, %rs5};
+; CHECK-NEXT:    ld.b32 %r7, [%rd1+6];
+; CHECK-NEXT:    max.f16x2 %r8, %r6, %r3;
+; CHECK-NEXT:    max.f16x2 %r9, %r7, %r3;
+; CHECK-NEXT:    st.b32 [%rd1+6], %r9;
+; CHECK-NEXT:    mov.b32 {%rs6, _}, %r8;
+; CHECK-NEXT:    st.b16 [%rd1+10], %rs6;
+; CHECK-NEXT:    ret;
+  %load1 = load <3 x half>, ptr %rd0, align 4
+  %p1 = fcmp ogt <3 x half> %load1, zeroinitializer
+  %s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer
+  store <3 x half> %s1, ptr %rd0, align 4
+  %in2 = getelementptr half, ptr %rd0, i64 3
+  %load2 = load <3 x half>, ptr %in2, align 4
+  %p2 = fcmp ogt <3 x half> %load2, zeroinitializer
+  %s2 = select <3 x i1> %p2, <3 x half> %load2, <3 x half> zeroinitializer
+  store <3 x half> %s2, ptr %in2, align 4
+  ret void
+}
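
A note on the .pragma "used_bytes_mask 0xfff" emitted in the first function: the vectorizer widens the two <3 x half> chains (6 bytes each, at byte offsets 0 and 6) into a single 16-byte ld.v4.b32, so only bytes 0-11 of the load are used. Assuming the mask carries one bit per loaded byte (an assumption, not confirmed by this commit), the value checks out:

#include <cassert>
#include <cstdint>

int main() {
  // Two <3 x half> chains, 3 halves x 2 bytes each, back to back.
  unsigned UsedBytes = 2 * (3 * 2);      // 12 of the 16 loaded bytes
  uint32_t Mask = (1u << UsedBytes) - 1; // one bit per used byte, low bits
  assert(Mask == 0xfff);                 // matches the pragma in the test
  return 0;
}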
