Skip to content

Commit 81b41c9

Browse files
committed
Fix to lowering to clean up generated code slightly when reductions aren't vectorized.
1 parent ee89d70 commit 81b41c9

File tree

6 files changed

+88
-7
lines changed

6 files changed

+88
-7
lines changed

benchmark/benchmarkflops.jl

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,3 +308,33 @@ function benchmark_aplusBc(sizes)
308308
br
309309
end
310310

311+
function benchmark_AplusAt(sizes)
312+
tests = ["Julia", "Clang-Polly", "GFortran", "GFortran-builtin", "icc", "ifort", "ifort-builtin", "LoopVectorization"]
313+
br = BenchmarkResult(tests, sizes)
314+
for (i,s) ∈ enumerate(sizes)
315+
A = rand(s,s); B = similar(A)
316+
n_gflop = 1e-9*s^2
317+
br[1,i] = n_gflop / @belapsed @. $B = $A + $A'
318+
baseB = copy(B)
319+
br[2,i] = n_gflop / @belapsed cAplusAt!($B, $A)
320+
@assert B ≈ baseB "Clang wrong?"
321+
br[3,i] = n_gflop / @belapsed fAplusAt!($B, $A)
322+
@assert B ≈ baseB "Fort wrong?"
323+
br[4,i] = n_gflop / @belapsed fAplusAtbuiltin!($B, $A)
324+
@assert B ≈ baseB "Fort-builtin wrong?"
325+
br[5,i] = n_gflop / @belapsed icAplusAt!($B, $A)
326+
@assert B ≈ baseB "icc wrong?"
327+
br[6,i] = n_gflop / @belapsed ifAplusAt!($B, $A)
328+
@assert B ≈ baseB "ifort wrong?"
329+
br[7,i] = n_gflop / @belapsed ifAplusAtbuiltin!($B, $A)
330+
@assert B ≈ baseB "ifort-builtin wrong?"
331+
br[8,i] = n_gflop / @belapsed @avx @. $B = $A + $A'
332+
@assert B ≈ baseB "LoopVec wrong?"
333+
# if i % 10 == 0
334+
# percent_complete = round(100i/ length(sizes), sigdigits = 4)
335+
# @show percent_complete
336+
# end
337+
end
338+
br
339+
end
340+

benchmark/driver.jl

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ include(joinpath(LOOPVECBENCHDIR, "plotbenchmarks.jl"))
1010

1111
using Distributed
1212

13-
addprocs(9);
13+
addprocs(10);
1414

1515
@everywhere begin
1616
pkgdir(pkg::String) = abspath(joinpath(dirname(Base.find_package(pkg)), ".."))
@@ -28,16 +28,18 @@ dot3_future = @spawnat 7 benchmark_dot3(2:256);
2828
sse_future = @spawnat 8 benchmark_sse(2:256);
2929
exp_future = @spawnat 9 benchmark_exp(2:256);
3030
aplusBc_future = @spawnat 10 benchmark_aplusBc(2:256);
31+
AplusAt_future = @spawnat 11 benchmark_AplusAt(2:256);
3132

32-
gemm_bench = fetch(gemm_future)
33-
AtmulB_bench = fetch(AtmulB_future)
3433
dot_bench = fetch(dot_future)
3534
selfdot_bench = fetch(selfdot_future)
35+
AplusAt_bench = fetch(AplusAt_future)
3636
gemv_bench = fetch(gemv_future)
3737
dot3_bench = fetch(dot3_future)
3838
sse_bench = fetch(sse_future)
3939
exp_bench = fetch(exp_future)
4040
aplusBc_bench = fetch(aplusBc_future)
41+
gemm_bench = fetch(gemm_future)
42+
AtmulB_bench = fetch(AtmulB_future)
4143

4244

4345
plot(gemm_bench)
@@ -49,6 +51,7 @@ plot(dot3_bench)
4951
plot(sse_bench)
5052
plot(exp_bench)
5153
plot(aplusBc_bench)
54+
plot(AplusAt_bench)
5255

5356
save(joinpath("~/Pictures", "bench_gemm_v3.png"), plot(gemm_bench));
5457
save(joinpath("~/Pictures", "bench_AtmulB_v3.png"), plot(AtmulB_bench));

benchmark/loadsharedlibs.jl

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,15 @@ if !isfile(LIBCTEST) || mtime(cfile) > mtime(LIBCTEST)
1616
run(`clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
1717
end
1818
if !isfile(LIBICTEST) || mtime(cfile) > mtime(LIBICTEST)
19-
run(`icc -fast -qopt-zmm-usage=high -qopt-matmul -shared -fPIC $cfile -o $LIBICTEST`)
19+
run(`icc -fast -qopt-zmm-usage=high -shared -fPIC $cfile -o $LIBICTEST`)
2020
end
2121
ffile = joinpath(LOOPVECBENCHDIR, "looptests.f90")
2222
if !isfile(LIBFTEST) || mtime(ffile) > mtime(LIBFTEST)
2323
# --param max-unroll-times defaults to ≥8, which is generally excessive
2424
run(`gfortran -Ofast -march=native -funroll-loops --param max-unroll-times=4 -floop-nest-optimize -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $ffile -o $LIBFTEST`)
2525
end
2626
if !isfile(LIBIFTEST) || mtime(ffile) > mtime(LIBIFTEST)
27-
run(`ifort -fast -qopt-zmm-usage=high -qopt-matmul -shared -fPIC $ffile -o $LIBIFTEST`)
27+
run(`ifort -fast -qopt-zmm-usage=high -shared -fPIC $ffile -o $LIBIFTEST`)
2828
end
2929

3030
for (prefix,Cshared,Fshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST), (:i,LIBICTEST,LIBIFTEST))
@@ -223,4 +223,28 @@ for (prefix,Cshared,Fshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST), (:i,LIBICTEST,
223223
s[]
224224
end
225225

226+
@eval function $(Symbol(prefix,:fAplusAt!))(B, A)
227+
N = size(B,1)
228+
ccall(
229+
(:AplusAt, $Fshared), Cvoid,
230+
(Ptr{Float64}, Ptr{Float64}, Ref{Clong}),
231+
B, A, Ref(N)
232+
)
233+
end
234+
@eval function $(Symbol(prefix,:fAplusAtbuiltin!))(B, A)
235+
N = size(B,1)
236+
ccall(
237+
(:AplusAtbuiltin, $Fshared), Cvoid,
238+
(Ptr{Float64}, Ptr{Float64}, Ref{Clong}),
239+
B, A, Ref(N)
240+
)
241+
end
242+
@eval function $(Symbol(prefix,:cAplusAt!))(B, A)
243+
N = size(B,1)
244+
ccall(
245+
(:AplusAt, $Cshared), Cvoid,
246+
(Ptr{Float64}, Ptr{Float64}, Clong),
247+
B, A, N
248+
)
249+
end
226250
end

benchmark/looptests.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,4 +171,11 @@ double OLSlp(double* restrict y, double* restrict X, double* restrict b, long N,
171171
return lp;
172172
}
173173

174+
void AplusAt(double* restrict B, double* restrict A, long N){
175+
for (long i = 0; i < N; i++){
176+
for (long j = 0; j < N; j++){
177+
B[j + i*N] = A[j + i*N] + A[i + j*N];
178+
}
179+
}
180+
}
174181

benchmark/looptests.f90

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,4 +236,21 @@ subroutine OLSlp(lp, y, X, b, N, P) BIND(C, name="OLSlp")
236236
lp = lp + d*d
237237
end do
238238
end subroutine OLSlp
239-
end module looptests
239+
subroutine AplusAt(B, A, N) BIND(C, name="AplusAt")
240+
integer(C_long), intent(in) :: N
241+
real(C_double), dimension(N,N), intent(out) :: B
242+
real(C_double), dimension(N,N), intent(in) :: A
243+
integer(C_long) :: i, j
244+
do concurrent(i = 1:N)
245+
do concurrent(j = 1:N)
246+
B(j,i) = A(j,i) + A(i,j)
247+
end do
248+
end do
249+
end subroutine AplusAt
250+
subroutine AplusAtbuiltin(B, A, N) BIND(C, name="AplusAtbuiltin")
251+
integer(C_long), intent(in) :: N
252+
real(C_double), dimension(N,N), intent(out) :: B
253+
real(C_double), dimension(N,N), intent(in) :: A
254+
B = A + transpose(A)
255+
end subroutine AplusAtbuiltin
256+
end module looptests

src/lowering.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ function lower_compute!(
365365
# making BitArrays inefficient.
366366
# parentsyms = [opp.variable for opp ∈ parents(op)]
367367
Uiter = opunrolled ? U - 1 : 0
368-
maskreduct = mask !== nothing && isreduction(op) && any(opp -> opp.variable === var, parents_op)
368+
maskreduct = mask !== nothing && isreduction(op) && vectorized ∈ reduceddependencies(op) #any(opp -> opp.variable === var, parents_op)
369369
# if a parent is not unrolled, the compiler should handle broadcasting CSE.
370370
# because unrolled/tiled parents result in an unrolled/tiled dependendency,
371371
# we handle both the tiled and untiled case here.

0 commit comments

Comments
 (0)