-
Notifications
You must be signed in to change notification settings - Fork 15k
Open
Description
I think LoopUnrollAndJam checks the dependency between vector loads and stores incorrectly.
Reproducer:
opt input.ll -S -passes=loop-unroll-and-jam -enable-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=2
input.ll:
; Reproducer for the reported LoopUnrollAndJam miscompile.
; Copies each row of %A into the following row, shifted by one column, two
; i16 elements at a time:
;   A[i+1][index .. index+1] = A[i][index+1 .. index+2]
; The GEPs below index %A as a [10 x [10 x i16]] array.
define void @badjam(ptr %A) {
entry:
br label %outer
; Outer loop: %i runs 0..8 (the loop exits once %add.i reaches 9).
outer:
%i = phi i16 [ 0, %entry ], [ %add.i, %outer2 ]
; %add.i = %i + 1 is both the next outer induction value and the row stored to.
%add.i = add nuw nsw i16 %i, 1
br label %inner
; Inner loop: %index takes the values 0, 2, 4, 6 (step 2, exits once
; %index.next reaches 8).
inner:
%index = phi i16 [ 0, %outer ], [ %index.next, %inner ]
; %index is always even here, so `or disjoint` with 1 yields %index + 1.
%offset.idx = or disjoint i16 %index, 1
; Load two elements: A[i][index+1] and A[i][index+2].
%0 = getelementptr inbounds [10 x [10 x i16]], ptr %A, i16 0, i16 %i, i16 %offset.idx
%wide.load = load <2 x i16>, ptr %0, align 1
; Store them to A[i+1][index] and A[i+1][index+1]. Row i+1 is the row the
; next outer iteration loads from, and the two-element store and load ranges
; overlap there (e.g. column index+1), so this store must precede that load.
%1 = getelementptr inbounds [10 x [10 x i16]], ptr %A, i16 0, i16 %add.i, i16 %index
store <2 x i16> %wide.load, ptr %1, align 1
%index.next = add nuw i16 %index, 2
%2 = icmp eq i16 %index.next, 8
br i1 %2, label %outer2, label %inner
; Outer-loop latch: leave after the iteration with %i == 8.
outer2:
%exitcond.not = icmp eq i16 %add.i, 9
br i1 %exitcond.not, label %exit, label %outer
exit:
ret void
}
Originally from C:
// Fragment only: the declarations of A, i and j are not shown in the report.
// Each row i+1 receives row i shifted down by one column.
// Request unroll-and-jam of the outer loop by a factor of 2.
#pragma unroll_and_jam(2)
for (i = 0; i < 9; i++) { // unroll-and-jam miscompile
// Inner loop: unrolling disabled, vectorization width forced to 2.
#pragma clang loop unroll(disable) vectorize_width(2)
for (j = 1; j < 9; j++) {
// Writes row i+1, which outer iteration i+1 then reads as its source row,
// so the outer iterations carry a store-to-load dependence.
A[i + 1][j - 1] = A[i][j];
}
}
When this is unroll-and-jammed, the results in row i=2 are copied from row i=1 too early, i.e. before row i=1 has been completely updated from row i=0.