-
Notifications
You must be signed in to change notification settings - Fork 15.4k
Labels
Description
Flang can't vectorize the loop in s243 of TSVC, whereas Clang does vectorize the equivalent loop written in C.
! Fortran version
! Shared data for the s243 reproducer: array extents, the global work
! arrays, and an explicit interface for the external routine dummy.
module mod
! ld is the extent of every array dimension declared below; nloops is
! declared but not referenced anywhere in this excerpt.
integer ld, nloops
parameter (ld=1000,nloops=135)
! One-dimensional arrays read and written by the s243 loop (c, d, e are
! only read there).
real a(ld), b(ld), c(ld), d(ld), e(ld)
! Two-dimensional arrays; s243 only passes them through to init/dummy.
real aa(ld,ld), bb(ld,ld), cc(ld,ld)
interface
! External routine called once after the loop in s243. Its body is not
! shown here -- presumably it keeps the loop results observable so the
! loop is not optimized away (TODO confirm against the TSVC sources).
subroutine dummy(ld,n,a,b,c,d,e,aa,bb,cc,x)
integer ld, n
real a(ld), b(ld), c(ld), d(ld), e(ld)
real aa(ld,ld), bb(ld,ld), cc(ld,ld)
! x is passed by value, unlike the other arguments.
real, value :: x
end subroutine
end interface
end module
subroutine s243 (n)
use mod
integer n, i
call init(ld,n,a,b,c,d,e,aa,bb,cc,'s243 ')
do 10 i = 1,n-1
a(i) = b(i) + c(i) * d(i)
b(i) = a(i) + d(i) * e(i)
a(i) = b(i) + a(i+1) * d(i)
10 continue
call dummy(ld,n,a,b,c,d,e,aa,bb,cc,1.)
end
// C version
/* Array extents for the C reproducer: LEN for the 1-D arrays, LEN2 for
   each dimension of the 2-D arrays. */
#define LEN 32000
#define LEN2 256
/* Global 1-D arrays accessed by the s243 loop. */
float a[LEN], b[LEN], c[LEN], d[LEN], e[LEN];
/* Global 2-D arrays; s243 only passes them to dummy(). */
float aa[LEN2][LEN2], bb[LEN2][LEN2], cc[LEN2][LEN2];
int s243() {
init( "s243 ");
for (int i = 0; i < LEN-1; i++) {
a[i] = b[i] + c[i ] * d[i];
b[i] = a[i] + d[i ] * e[i];
a[i] = b[i] + a[i+1] * d[i];
}
dummy(a, b, c, d, e, aa, bb, cc, 0.);
return 0;
}
$ flang-new -v -Ofast s243.f -S -Rpass=vector
flang-new version 18.0.0git (https://github.com/llvm/llvm-project.git 0e93d04001e45f39cabf0ffb5093512a7f622cc0)
Target: aarch64-unknown-linux-gnu
Thread model: posix
InstalledDir: /path/to/install/bin
Found candidate GCC installation: /path/to/lib/gcc/aarch64-unknown-linux-gnu/11.2.0
Selected GCC installation: /path/to/lib/gcc/aarch64-unknown-linux-gnu/11.2.0
Candidate multilib: .;@m64
Selected multilib: .;@m64
"/path/to/install/bin/flang-new" -fc1 -triple aarch64-unknown-linux-gnu -S -fcolor-diagnostics -mrelocation-model pic -pic-level 2 -pic-is-pie -ffast-math -target-cpu generic -target-feature +neon -target-feature +v8a -fstack-arrays -fversion-loops-for-stride -Rpass=vector -O3 -o s243.s -x f95-cpp-input s243.f
$ clang -Ofast s243.c -S -Rpass=vector
s243.c:15:3: remark: vectorized loop (vectorization width: 4, interleaved count: 1) [-Rpass=loop-vectorize]
15 | for (int i = 0; i < LEN-1; i++) {
| ^
The Fortran code isn't vectorized because DSE (dead store elimination) doesn't remove the first store to a(i).
DSE relies on BasicAA, and BasicAA can't prove that a(i) and a(i+1) don't alias each other, so DSE leaves the first store in place.
In fact, Flang generates LLVM IR equivalent to the following C code, in which the loop starts at 1 and every subscript carries a -1 offset; this form makes alias analysis harder.
(FYI: BasicAA deliberately avoids complicated analyses that could increase compilation time.)
for (int i = 1; i <= n-1; i++) {
a[i-1] = b[i-1] + c[i-1] * d[i-1];
b[i-1] = a[i-1] + d[i-1] * e[i-1];
a[i-1] = b[i-1] + a[i] * d[i-1];
}
Conversely, the following Fortran code, whose loop starts at 0, is vectorized without difficulty.
do 10 i = 0,n-2
a(i+1) = b(i+1) + c(i+1) * d(i+1)
b(i+1) = a(i+1) + d(i+1) * e(i+1)
a(i+1) = b(i+1) + a(i+2) * d(i+1)
10 continue
$ flang-new -Ofast s243_2.f -S -Rpass=vector
path/to/s243_2.f:14:7: remark: vectorized loop (vectorization width: 4, interleaved count: 1) [-Rpass=loop-vectorize]