Skip to content

Commit 6185381

Browse files
committed
Try to improve performance for simple single loops when masks aren't needed, and revamp benchmarks.
1 parent 182a3a7 commit 6185381

File tree

9 files changed

+162
-83
lines changed

9 files changed

+162
-83
lines changed

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.8.1"
4+
version = "0.8.2"
55

66
[deps]
77
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
@@ -18,7 +18,7 @@ OffsetArrays = "1"
1818
SIMDPirates = "0.8.4"
1919
SLEEFPirates = "0.5"
2020
UnPack = "0,1"
21-
VectorizationBase = "0.12.2"
21+
VectorizationBase = "0.12.4"
2222
julia = "1.1"
2323

2424
[extras]

benchmark/benchmarkflops.jl

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,10 @@ function matmul_bench!(br, C, A, B, i)
6161
@assert C ≈ Cblas "Fort builtin gemm wrong?"; fill!(C, NaN)
6262
br[10,i] = n_gflop / @belapsed ifgemm_builtin!($C, $A, $B)
6363
@assert C ≈ Cblas "ifort builtin gemm wrong?"; fill!(C, NaN)
64-
br[11,i] = n_gflop / @belapsed mul!($C, $A, $B);
65-
fill!(C, NaN)
64+
br[11,i] = n_gflop / @belapsed dgemmopenblas!($C, $A, $B);
65+
@assert C ≈ Cblas "OpenBLAS gemm wrong?"
6666
br[12,i] = n_gflop / @belapsed dgemmmkl!($C, $A, $B)
67-
@assert C ≈ Cblas "MKL JIT gemm wrong?"
67+
@assert C ≈ Cblas "MKL gemm wrong?"
6868
# br[12,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
6969
end
7070
function A_mul_B_bench!(br, s, i)
@@ -109,7 +109,7 @@ blastests() = [
109109
"GFortran", "icc", "ifort",
110110
"g++ & Eigen-3", "clang++ & Eigen-3",
111111
"GFortran-builtin", "ifort-builtin",
112-
BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "MKL"
112+
"OpenBLAS", "MKL"
113113
]
114114

115115
function benchmark_AmulB(sizes)
@@ -160,7 +160,7 @@ function dot_bench!(br, s, i)
160160
br[9,i] = n_gflop / @belapsed dot($a, $b)
161161
end
162162
function benchmark_dot(sizes)
163-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
163+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", "OpenBLAS"]
164164
br = BenchmarkResult(tests, sizes)
165165
sm = br.sizedresults.results
166166
pmap(is -> dot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -189,7 +189,7 @@ function selfdot_bench!(br, s, i)
189189
br[9,i] = n_gflop / @belapsed dot($a, $a)
190190
end
191191
function benchmark_selfdot(sizes)
192-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
192+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", "OpenBLAS"]
193193
br = BenchmarkResult(tests, sizes)
194194
sm = br.sizedresults.results
195195
pmap(is -> selfdot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -222,7 +222,8 @@ function gemv_bench!(br, x, A, y, i)
222222
@assert x ≈ xblas "Fort wrong?"; fill!(x, NaN);
223223
br[10,i] = n_gflop / @belapsed ifgemv_builtin!($x, $A, $y)
224224
@assert x ≈ xblas "ifort wrong?"; fill!(x, NaN);
225-
br[11,i] = n_gflop / @belapsed mul!($x, $A, $y)
225+
br[11,i] = n_gflop / @belapsed dgemvopenblas!($x, $A, $y)
226+
@assert x ≈ xblas "gemvopenblas wrong?"; fill!(x, NaN);
226227
br[12,i] = n_gflop / @belapsed dgemvmkl!($x, $A, $y)
227228
@assert x ≈ xblas "gemvmkl wrong?"; fill!(x, NaN);
228229
end
@@ -316,7 +317,7 @@ function sse_bench!(br, s, i)
316317
br[9,i] = n_gflop / @belapsed sse!($Xβ, $y, $X, $β)
317318
end
318319
function benchmark_sse(sizes)
319-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
320+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", "OpenBLAS"]
320321
br = BenchmarkResult(tests, sizes)
321322
sm = br.sizedresults.results
322323
pmap(is -> sse_bench!(sm, is[2], is[1]), enumerate(sizes))

benchmark/driver.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ end
2121

2222
# sizes = 23:23
2323
sizes = 256:-1:2
24+
longsizes = 512:-1:2
2425

2526
logdettriangle_bench = benchmark_logdettriangle(sizes); println("logdet(LowerTriangular(A)) benchmark results:"); println(logdettriangle_bench)
2627
dot3_bench = benchmark_dot3(sizes); println("x' * A * y benchmark results:"); println(dot3_bench)
@@ -37,8 +38,8 @@ filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes); println("Benchmark re
3738
filter2d_3x3_bench = benchmark_filter2d3x3(sizes); println("Benchmark results for statically sized 3x3 convolution:"); println(filter2d_3x3_bench)
3839
filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes); println("Benchmark results for unrolled 3x3 convolution:"); println(filter2d_unrolled_bench)
3940

40-
dot_bench = benchmark_dot(sizes); println("a' * b benchmark results:"); println(dot_bench)
41-
selfdot_bench = benchmark_selfdot(sizes); println("a' * a benchmark results:"); println(selfdot_bench)
41+
dot_bench = benchmark_dot(longsizes); println("a' * b benchmark results:"); println(dot_bench)
42+
selfdot_bench = benchmark_selfdot(longsizes); println("a' * a benchmark results:"); println(selfdot_bench)
4243
# Run the sum-of-squared-errors benchmark over `sizes` and print its results.
sse_bench = benchmark_sse(sizes); println("Benchmark results of summing squared error:"); println(sse_bench)
4344
aplusBc_bench = benchmark_aplusBc(sizes); println("Benchmark results of a .+ B .* c':"); println(aplusBc_bench)
4445
AplusAt_bench = benchmark_AplusAt(sizes); println("Benchmark results of A * A':"); println(AplusAt_bench)

benchmark/loadsharedlibs.jl

Lines changed: 93 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,9 @@
1-
using LinearAlgebra, LoopVectorization
1+
using LinearAlgebra, LoopVectorization, Libdl
22
using LoopVectorization.VectorizationBase: REGISTER_SIZE
33

44
# const LOOPVECBENCHDIR = joinpath(pkgdir(LoopVectorization), "benchmark")
55
include(joinpath(LOOPVECBENCHDIR, "looptests.jl"))
66

7-
const LIBCTEST = joinpath(LOOPVECBENCHDIR, "libctests.so")
8-
const LIBFTEST = joinpath(LOOPVECBENCHDIR, "libftests.so")
9-
const LIBICTEST = joinpath(LOOPVECBENCHDIR, "libictests.so")
10-
const LIBIFTEST = joinpath(LOOPVECBENCHDIR, "libiftests.so")
11-
const LIBEIGENTEST = joinpath(LOOPVECBENCHDIR, "libetest.so")
12-
const LIBIEIGENTEST = joinpath(LOOPVECBENCHDIR, "libietest.so")
13-
const LIBDIRECTCALLJIT = joinpath(LOOPVECBENCHDIR, "libdcjtest.so")
147

158
# requires Clang with polly to build
169
cfile = joinpath(LOOPVECBENCHDIR, "looptests.c")
@@ -44,43 +37,108 @@ if !isfile(LIBIEIGENTEST) || mtime(eigenfile) > mtime(LIBIEIGENTEST)
4437
# run(`icpc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
4538
end
4639

47-
MKL_ROOT = "/home/chriselrod/intel"
48-
directcalljitfile = joinpath(LOOPVECBENCHDIR, "directcalljit.f90")
49-
if !isfile(LIBDIRECTCALLJIT) || mtime(directcalljitfile) > mtime(LIBDIRECTCALLJIT)
50-
run(`ifort -fast -DMKL_DIRECT_CALL_SEQ_JIT -fpp -qopt-zmm-usage=high -Wl,--start-group $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_intel_lp64.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_sequential.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_core.a")) -Wl,--end-group -I$(joinpath(MKL_ROOT, "mkl/include")) -I$(joinpath(MKL_ROOT, "compilers_and_libraries_2020.1.217/linux/mkl/include/intel64/lp64")) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
51-
# run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -Wl,--start-group $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_intel_lp64.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_sequential.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_core.a")) -Wl,--end-group -I$(joinpath(MKL_ROOT, "mkl/include")) -I$(joinpath(MKL_ROOT, "compilers_and_libraries_2020.1.217/linux/mkl/include/intel64/lp64")) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
40+
# MKL_ROOT = "/home/chriselrod/intel"
41+
# directcalljitfile = joinpath(LOOPVECBENCHDIR, "directcalljit.f90")
42+
# if !isfile(LIBDIRECTCALLJIT) || mtime(directcalljitfile) > mtime(LIBDIRECTCALLJIT)
43+
# run(`ifort -fast -DMKL_DIRECT_CALL_SEQ_JIT -fpp -qopt-zmm-usage=high -Wl,--start-group $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_intel_lp64.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_sequential.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_core.a")) -Wl,--end-group -I$(joinpath(MKL_ROOT, "mkl/include")) -I$(joinpath(MKL_ROOT, "compilers_and_libraries_2020.1.217/linux/mkl/include/intel64/lp64")) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
44+
# # run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -Wl,--start-group $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_intel_lp64.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_sequential.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_core.a")) -Wl,--end-group -I$(joinpath(MKL_ROOT, "mkl/include")) -I$(joinpath(MKL_ROOT, "compilers_and_libraries_2020.1.217/linux/mkl/include/intel64/lp64")) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
5245

53-
# run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
54-
end
46+
# # run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
47+
# end
5548

56-
istransposed(x) = false
57-
istransposed(x::Adjoint) = true
58-
istransposed(x::Transpose) = true
59-
"""
60-
If transposed, requires them to be square
61-
"""
49+
50+
const LIBCTEST = joinpath(LOOPVECBENCHDIR, "libctests.so")
51+
const LIBFTEST = joinpath(LOOPVECBENCHDIR, "libftests.so")
52+
const LIBICTEST = joinpath(LOOPVECBENCHDIR, "libictests.so")
53+
const LIBIFTEST = joinpath(LOOPVECBENCHDIR, "libiftests.so")
54+
const LIBEIGENTEST = joinpath(LOOPVECBENCHDIR, "libetest.so")
55+
const LIBIEIGENTEST = joinpath(LOOPVECBENCHDIR, "libietest.so")
56+
57+
using MKL_jll, OpenBLAS_jll
58+
59+
const libMKL = Libdl.dlopen(MKL_jll.libmkl_rt)
60+
const DGEMM_MKL = Libdl.dlsym(libMKL, :dgemm)
61+
const DGEMV_MKL = Libdl.dlsym(libMKL, :dgemv)
62+
const MKL_SET_NUM_THREADS = Libdl.dlsym(libMKL, :MKL_Set_Num_Threads)
63+
64+
const libOpenBLAS = Libdl.dlopen(OpenBLAS_jll.libopenblas)
65+
const DGEMM_OpenBLAS = Libdl.dlsym(libOpenBLAS, :dgemm_64_)
66+
const DGEMV_OpenBLAS = Libdl.dlsym(libOpenBLAS, :dgemv_64_)
67+
const OPENBLAS_SET_NUM_THREADS = Libdl.dlsym(libOpenBLAS, :openblas_set_num_threads64_)
68+
69+
istransposed(x) = 'N'
70+
istransposed(x::Adjoint{<:Real}) = 'T'
71+
istransposed(x::Adjoint) = 'C'
72+
istransposed(x::Transpose) = 'T'
6273
function dgemmmkl!(C::AbstractMatrix{Float64}, A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64})
74+
transA = istransposed(A)
75+
transB = istransposed(B)
76+
M, N = size(C); K = size(B, 1)
77+
M32 = M % Int32
78+
K32 = K % Int32
79+
N32 = N % Int32
80+
pA = parent(A); pB = parent(B)
81+
ldA = stride(pA, 2) % Int32
82+
ldB = stride(pB, 2) % Int32
83+
ldC = stride(C, 2) % Int32
84+
α = 1.0
85+
β = 0.0
86+
ccall(
87+
DGEMM_MKL, Cvoid,
88+
(Ref{UInt8}, Ref{UInt8}, Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ref{Float64}, Ref{Int32}, Ref{Float64}, Ref{Int32}, Ref{Float64}, Ref{Float64}, Ref{Int32}),
89+
transA, transB, M32, N32, K32, α, pA, ldA, pB, ldB, β, C, ldC
90+
)
91+
end
92+
function dgemmopenblas!(C::AbstractMatrix{Float64}, A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64})
93+
transA = istransposed(A)
94+
transB = istransposed(B)
6395
M, N = size(C); K = size(B, 1)
96+
pA = parent(A); pB = parent(B)
97+
ldA = stride(pA, 2)
98+
ldB = stride(pB, 2)
99+
ldC = stride(C, 2)
100+
α = 1.0
101+
β = 0.0
64102
ccall(
65-
(:dgemmjit, LIBDIRECTCALLJIT), Cvoid,
66-
(Ptr{Float64},Ptr{Float64},Ptr{Float64},Ref{Int},Ref{Int},Ref{Int},Ref{Bool},Ref{Bool}),
67-
parent(C), parent(A), parent(B),
68-
Ref(M), Ref(K), Ref(N),
69-
Ref(istransposed(A)), Ref(istransposed(B))
103+
DGEMM_OpenBLAS, Cvoid,
104+
(Ref{UInt8}, Ref{UInt8}, Ref{Int64}, Ref{Int64}, Ref{Int64}, Ref{Float64}, Ref{Float64}, Ref{Int64}, Ref{Float64}, Ref{Int64}, Ref{Float64}, Ref{Float64}, Ref{Int64}),
105+
transA, transB, M, N, K, α, pA, ldA, pB, ldB, β, C, ldC
70106
)
71107
end
72-
mkl_set_num_threads(N::Integer) = ccall((:set_num_threads, LIBDIRECTCALLJIT), Cvoid, (Ref{UInt32},), Ref(N % UInt32))
108+
# Set the number of threads MKL may use.
# `MKL_SET_NUM_THREADS` is looked up as the C symbol `MKL_Set_Num_Threads`, whose
# prototype is `void MKL_Set_Num_Threads(int nth)`: the count is passed by value.
# Passing a `Ref` would hand MKL a pointer where it expects the count itself.
mkl_set_num_threads(N::Integer) = ccall(MKL_SET_NUM_THREADS, Cvoid, (Cint,), N)
73109
mkl_set_num_threads(1)
74-
"""
75-
If transposed, requires them to be square
76-
"""
110+
# Set the number of threads OpenBLAS may use.
# The C prototype is `void openblas_set_num_threads(int num_threads)` (the `64_`
# suffix only marks the ILP64 build), so the count is passed by value as a C `int`.
# Passing a `Ref` would hand OpenBLAS a pointer where it expects the count itself.
openblas_set_num_threads(N::Integer) = ccall(OPENBLAS_SET_NUM_THREADS, Cvoid, (Cint,), N)
111+
openblas_set_num_threads(1)
77112
function dgemvmkl!(y::AbstractVector{Float64}, A::AbstractMatrix{Float64}, x::AbstractVector{Float64})
78-
M, N = size(A);
113+
transA = istransposed(A)
114+
pA = parent(A)
115+
M, N = size(pA)
116+
M32 = M % Int32
117+
N32 = N % Int32
118+
ldA = stride(pA, 2) % Int32
119+
incx = LinearAlgebra.stride1(x) % Int32
120+
incy = LinearAlgebra.stride1(y) % Int32
121+
α = 1.0
122+
β = 0.0
123+
ccall(
124+
DGEMV_MKL, Cvoid,
125+
(Ref{UInt8}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ref{Float64}, Ref{Int32}, Ref{Float64}, Ref{Int32}, Ref{Float64}, Ref{Float64}, Ref{Int32}),
126+
transA, M32, N32, α, A, ldA, x, incx, β, y, incy
127+
)
128+
end
129+
function dgemvopenblas!(y::AbstractVector{Float64}, A::AbstractMatrix{Float64}, x::AbstractVector{Float64})
130+
transA = istransposed(A)
131+
pA = parent(A)
132+
M, N = size(pA)
133+
ldA = stride(pA, 2)
134+
incx = LinearAlgebra.stride1(x)
135+
incy = LinearAlgebra.stride1(y)
136+
α = 1.0
137+
β = 0.0
79138
ccall(
80-
(:dgemvjit, LIBDIRECTCALLJIT), Cvoid,
81-
(Ptr{Float64},Ptr{Float64},Ptr{Float64},Ref{Int},Ref{Int},Ref{Bool}),
82-
parent(y), parent(A), parent(x),
83-
Ref(M), Ref(N), Ref(istransposed(A))
139+
DGEMV_OpenBLAS, Cvoid,
140+
(Ref{UInt8}, Ref{Int64}, Ref{Int64}, Ref{Float64}, Ref{Float64}, Ref{Int64}, Ref{Float64}, Ref{Int64}, Ref{Float64}, Ref{Float64}, Ref{Int64}),
141+
transA, M, N, α, A, ldA, x, incx, β, y, incy
84142
)
85143
end
86144

benchmark/looptests.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -248,8 +248,8 @@ void filter2d(double* restrict B, double* restrict A, double* restrict K, long M
248248
for (long na = offset; na < N-offset; na++){
249249
for (long ma = offset; ma < M-offset; ma++){
250250
double tmp = 0.0;
251-
for (long nk = -offset; nk < offset + 1; nk++){
252-
for (long mk = -offset; mk < offset + 1; mk++){
251+
for (long mk = -offset; mk < offset + 1; mk++){
252+
for (long nk = -offset; nk < offset + 1; nk++){
253253
tmp += A[(ma+mk) + (na+nk)*M] * K[(mk+offset) + (nk+offset)*(2*offset+1)];
254254
}
255255
}
@@ -262,8 +262,8 @@ void filter2d3x3(double* restrict B, double* restrict A, double* restrict K, lon
262262
for (long na = offset; na < N-offset; na++){
263263
for (long ma = offset; ma < M-offset; ma++){
264264
double tmp = 0.0;
265-
for (long nk = -offset; nk < offset + 1; nk++){
266-
for (long mk = -offset; mk < offset + 1; mk++){
265+
for (long mk = -offset; mk < offset + 1; mk++){
266+
for (long nk = -offset; nk < offset + 1; nk++){
267267
tmp += A[(ma+mk) + (na+nk)*M] * K[(mk+offset) + (nk+offset)*(2*offset+1)];
268268
}
269269
}

benchmark/looptests.f90

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -324,8 +324,10 @@ subroutine filter2d(B, A, K, Ma, Na, offset) BIND(C, name="filter2d")
324324
real(C_double) :: tmp
325325
do concurrent(mma = 1+offset:Ma-offset, nna = 1+offset:Na-offset)
326326
tmp = 0
327-
do concurrent(nnk = -offset:offset, mmk = -offset:offset)
328-
tmp = tmp + A(mma + mmk, nna + nnk) * K(mmk, nnk)
327+
do mmk = -offset,offset
328+
do nnk = -offset,offset
329+
tmp = tmp + A(mma + mmk, nna + nnk) * K(mmk, nnk)
330+
end do
329331
end do
330332
B(mma,nna) = tmp
331333
end do
@@ -340,8 +342,10 @@ subroutine filter2d3x3(B, A, K, Ma, Na) BIND(C, name="filter2d3x3")
340342
real(C_double) :: tmp
341343
do concurrent(mma = 1+offset:Ma-offset, nna = 1+offset:Na-offset)
342344
tmp = 0
343-
do concurrent(nnk = -offset:offset, mmk = -offset:offset)
344-
tmp = tmp + A(mma + mmk, nna + nnk) * K(mmk, nnk)
345+
do mmk = -offset,offset
346+
do nnk = -offset,offset
347+
tmp = tmp + A(mma + mmk, nna + nnk) * K(mmk, nnk)
348+
end do
345349
end do
346350
B(mma,nna) = tmp
347351
end do

src/LoopVectorization.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
module LoopVectorization
22

3+
if (!isnothing(get(ENV, "TRAVIS_BRANCH", nothing)) || !isnothing(get(ENV, "APPVEYOR", nothing))) && isdefined(Base, :Experimental) && isdefined(Base.Experimental, Symbol("@optlevel"))
4+
@eval Base.Experimental.@optlevel 1
5+
end
6+
37
using VectorizationBase, SIMDPirates, SLEEFPirates, UnPack, OffsetArrays
48
using VectorizationBase: REGISTER_SIZE, extract_data, num_vector_load_expr,
59
mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valmulsub, valadd, valsub, _MM,

src/lower_store.jl

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,19 @@ end
4343
# variable_name(op::Operation, suffix) = Symbol(mangledvar(op), suffix, :_)
4444
# # variable_name(op::Operation, suffix, u::Int) = (n = variable_name(op, suffix); u < 0 ? n : Symbol(n, u))
4545
function reduce_range!(q::Expr, toreduct::Symbol, instr::Instruction, Uh::Int, Uh2::Int)
46-
for u ∈ Uh:Uh2-1
47-
tru = Symbol(toreduct, u - Uh)
48-
push!(q.args, Expr(:(=), tru, Expr(instr, tru, Symbol(toreduct, u))))
49-
end
50-
for u ∈ 2Uh:Uh2-1
51-
tru = Symbol(toreduct, u - 2Uh)
52-
push!(q.args, Expr(:(=), tru, Expr(instr, tru, Symbol(toreduct, u))))
46+
if 2Uh == Uh2
47+
for u ∈ 0:2:Uh2-1
48+
push!(q.args, Expr(:(=), Symbol(toreduct, (u>>>1)), Expr(instr, Symbol(toreduct, u), Symbol(toreduct, u + 1))))
49+
end
50+
else
51+
for u ∈ Uh:Uh2-1
52+
tru = Symbol(toreduct, u - Uh)
53+
push!(q.args, Expr(:(=), tru, Expr(instr, tru, Symbol(toreduct, u))))
54+
end
55+
for u ∈ 2Uh:Uh2-1
56+
tru = Symbol(toreduct, u - 2Uh)
57+
push!(q.args, Expr(:(=), tru, Expr(instr, tru, Symbol(toreduct, u))))
58+
end
5359
end
5460
end
5561
function reduce_range!(q::Expr, ls::LoopSet, Ulow::Int, Uhigh::Int)

0 commit comments

Comments
 (0)