Skip to content

Commit e2e2057

Browse files
davemgreen authored and tstellar committed
[ARM] Ensure loop invariant active.lane.mask operands
CGP can move instructions such as a ptrtoint into a loop, but MVETailPredication currently assumes that the trip count is loop invariant when converting active.lane.mask intrinsics. This change tries to make the active.lane.mask operands loop invariant, and bails out if that is not possible. Differential Revision: https://reviews.llvm.org/D100550
1 parent 0f3fec4 commit e2e2057

File tree

2 files changed

+149
-0
lines changed

2 files changed

+149
-0
lines changed

llvm/lib/Target/ARM/MVETailPredication.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -205,6 +205,10 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
205205
EnableTailPredication == TailPredication::ForceEnabled;
206206

207207
Value *ElemCount = ActiveLaneMask->getOperand(1);
208+
bool Changed = false;
209+
if (!L->makeLoopInvariant(ElemCount, Changed))
210+
return false;
211+
208212
auto *EC= SE->getSCEV(ElemCount);
209213
auto *TC = SE->getSCEV(TripCount);
210214
int VectorWidth =
Lines changed: 145 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,145 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
3+
4+
; This test has an instruction that gets sunk into the loop, that is an
5+
; active.lane.mask operand. (%exitcount.ptrcnt.to.int = ptrtoint). We
6+
; need to make sure it is loop invariant.
7+
8+
define i32 @a(i32* readnone %b, i8* %c) {
9+
; CHECK-LABEL: a:
10+
; CHECK: @ %bb.0: @ %entry
11+
; CHECK-NEXT: .save {r4, lr}
12+
; CHECK-NEXT: push {r4, lr}
13+
; CHECK-NEXT: cmp r0, r1
14+
; CHECK-NEXT: it ls
15+
; CHECK-NEXT: popls {r4, pc}
16+
; CHECK-NEXT: .LBB0_1: @ %while.body.preheader
17+
; CHECK-NEXT: subs r0, r0, r1
18+
; CHECK-NEXT: movs r3, #1
19+
; CHECK-NEXT: add.w r2, r0, #15
20+
; CHECK-NEXT: mov r12, r1
21+
; CHECK-NEXT: bic r2, r2, #15
22+
; CHECK-NEXT: subs r2, #16
23+
; CHECK-NEXT: add.w lr, r3, r2, lsr #4
24+
; CHECK-NEXT: movs r2, #0
25+
; CHECK-NEXT: dls lr, lr
26+
; CHECK-NEXT: .LBB0_2: @ %vector.body
27+
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
28+
; CHECK-NEXT: adds r3, r1, r2
29+
; CHECK-NEXT: vctp.8 r0
30+
; CHECK-NEXT: vmov.8 q0[0], r3
31+
; CHECK-NEXT: adds r4, r3, #1
32+
; CHECK-NEXT: vmov.8 q0[1], r4
33+
; CHECK-NEXT: adds r4, r3, #2
34+
; CHECK-NEXT: vmov.8 q0[2], r4
35+
; CHECK-NEXT: adds r4, r3, #3
36+
; CHECK-NEXT: vmov.8 q0[3], r4
37+
; CHECK-NEXT: adds r4, r3, #4
38+
; CHECK-NEXT: vmov.8 q0[4], r4
39+
; CHECK-NEXT: adds r4, r3, #5
40+
; CHECK-NEXT: vmov.8 q0[5], r4
41+
; CHECK-NEXT: adds r4, r3, #6
42+
; CHECK-NEXT: vmov.8 q0[6], r4
43+
; CHECK-NEXT: adds r4, r3, #7
44+
; CHECK-NEXT: vmov.8 q0[7], r4
45+
; CHECK-NEXT: add.w r4, r3, #8
46+
; CHECK-NEXT: vmov.8 q0[8], r4
47+
; CHECK-NEXT: add.w r4, r3, #9
48+
; CHECK-NEXT: vmov.8 q0[9], r4
49+
; CHECK-NEXT: add.w r4, r3, #10
50+
; CHECK-NEXT: vmov.8 q0[10], r4
51+
; CHECK-NEXT: add.w r4, r3, #11
52+
; CHECK-NEXT: vmov.8 q0[11], r4
53+
; CHECK-NEXT: add.w r4, r3, #12
54+
; CHECK-NEXT: vmov.8 q0[12], r4
55+
; CHECK-NEXT: add.w r4, r3, #13
56+
; CHECK-NEXT: vmov.8 q0[13], r4
57+
; CHECK-NEXT: add.w r4, r3, #14
58+
; CHECK-NEXT: adds r2, #16
59+
; CHECK-NEXT: subs r0, #16
60+
; CHECK-NEXT: vmov.8 q0[14], r4
61+
; CHECK-NEXT: adds r3, #15
62+
; CHECK-NEXT: vmov.8 q0[15], r3
63+
; CHECK-NEXT: vpst
64+
; CHECK-NEXT: vstrbt.8 q0, [r12], #16
65+
; CHECK-NEXT: le lr, .LBB0_2
66+
; CHECK-NEXT: @ %bb.3: @ %while.end
67+
; CHECK-NEXT: pop {r4, pc}
68+
entry:
69+
%0 = bitcast i32* %b to i8*
70+
%cmp3 = icmp ugt i8* %0, %c
71+
br i1 %cmp3, label %while.body.preheader, label %while.end
72+
73+
while.body.preheader: ; preds = %entry
74+
%c5 = ptrtoint i8* %c to i32
75+
%1 = sub i32 0, %c5
76+
%uglygep = getelementptr i8, i8* %0, i32 %1
77+
%exitcount.ptrcnt.to.int = ptrtoint i8* %uglygep to i32
78+
%n.rnd.up = add i32 %exitcount.ptrcnt.to.int, 15
79+
%n.vec = and i32 %n.rnd.up, -16
80+
br label %vector.body
81+
82+
vector.body: ; preds = %vector.body, %while.body.preheader
83+
%index = phi i32 [ 0, %while.body.preheader ], [ %index.next, %vector.body ]
84+
%next.gep = getelementptr i8, i8* %c, i32 %index
85+
%2 = or i32 %index, 1
86+
%next.gep7 = getelementptr i8, i8* %c, i32 %2
87+
%3 = or i32 %index, 2
88+
%next.gep8 = getelementptr i8, i8* %c, i32 %3
89+
%4 = or i32 %index, 3
90+
%next.gep9 = getelementptr i8, i8* %c, i32 %4
91+
%5 = or i32 %index, 4
92+
%next.gep10 = getelementptr i8, i8* %c, i32 %5
93+
%6 = or i32 %index, 5
94+
%next.gep11 = getelementptr i8, i8* %c, i32 %6
95+
%7 = or i32 %index, 6
96+
%next.gep12 = getelementptr i8, i8* %c, i32 %7
97+
%8 = or i32 %index, 7
98+
%next.gep13 = getelementptr i8, i8* %c, i32 %8
99+
%9 = or i32 %index, 8
100+
%next.gep14 = getelementptr i8, i8* %c, i32 %9
101+
%10 = or i32 %index, 9
102+
%next.gep15 = getelementptr i8, i8* %c, i32 %10
103+
%11 = or i32 %index, 10
104+
%next.gep16 = getelementptr i8, i8* %c, i32 %11
105+
%12 = or i32 %index, 11
106+
%next.gep17 = getelementptr i8, i8* %c, i32 %12
107+
%13 = or i32 %index, 12
108+
%next.gep18 = getelementptr i8, i8* %c, i32 %13
109+
%14 = or i32 %index, 13
110+
%next.gep19 = getelementptr i8, i8* %c, i32 %14
111+
%15 = or i32 %index, 14
112+
%next.gep20 = getelementptr i8, i8* %c, i32 %15
113+
%16 = or i32 %index, 15
114+
%next.gep21 = getelementptr i8, i8* %c, i32 %16
115+
%17 = insertelement <16 x i8*> poison, i8* %next.gep, i32 0
116+
%18 = insertelement <16 x i8*> %17, i8* %next.gep7, i32 1
117+
%19 = insertelement <16 x i8*> %18, i8* %next.gep8, i32 2
118+
%20 = insertelement <16 x i8*> %19, i8* %next.gep9, i32 3
119+
%21 = insertelement <16 x i8*> %20, i8* %next.gep10, i32 4
120+
%22 = insertelement <16 x i8*> %21, i8* %next.gep11, i32 5
121+
%23 = insertelement <16 x i8*> %22, i8* %next.gep12, i32 6
122+
%24 = insertelement <16 x i8*> %23, i8* %next.gep13, i32 7
123+
%25 = insertelement <16 x i8*> %24, i8* %next.gep14, i32 8
124+
%26 = insertelement <16 x i8*> %25, i8* %next.gep15, i32 9
125+
%27 = insertelement <16 x i8*> %26, i8* %next.gep16, i32 10
126+
%28 = insertelement <16 x i8*> %27, i8* %next.gep17, i32 11
127+
%29 = insertelement <16 x i8*> %28, i8* %next.gep18, i32 12
128+
%30 = insertelement <16 x i8*> %29, i8* %next.gep19, i32 13
129+
%31 = insertelement <16 x i8*> %30, i8* %next.gep20, i32 14
130+
%32 = insertelement <16 x i8*> %31, i8* %next.gep21, i32 15
131+
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %exitcount.ptrcnt.to.int)
132+
%33 = ptrtoint <16 x i8*> %32 to <16 x i32>
133+
%34 = trunc <16 x i32> %33 to <16 x i8>
134+
%35 = bitcast i8* %next.gep to <16 x i8>*
135+
call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %34, <16 x i8>* %35, i32 1, <16 x i1> %active.lane.mask)
136+
%index.next = add i32 %index, 16
137+
%36 = icmp eq i32 %index.next, %n.vec
138+
br i1 %36, label %while.end, label %vector.body
139+
140+
while.end: ; preds = %vector.body, %entry
141+
ret i32 undef
142+
}
143+
144+
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
145+
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)

0 commit comments

Comments
 (0)