Skip to content

Commit 51b81f2

Browse files
authored
Merge pull request #118 from chriselrod/loopvecloweringoverhaul
Revamp lowering of code to improve code gen
2 parents 5f8f32b + 07a1bce commit 51b81f2

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+1516
-734
lines changed

Project.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1515
[compat]
1616
DocStringExtensions = "0.8"
1717
OffsetArrays = "1"
18-
SIMDPirates = "0.7.25"
19-
SLEEFPirates = "0.4.8"
18+
SIMDPirates = "0.8.3"
19+
SLEEFPirates = "0.5"
2020
UnPack = "0,1"
21-
VectorizationBase = "0.11.5"
21+
VectorizationBase = "0.12.1"
2222
julia = "1.1"
2323

2424
[extras]

benchmark/benchmarkflops.jl

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ blastests() = [
107107
"LoopVectorization",
108108
"Julia", "Clang",
109109
"GFortran", "icc", "ifort",
110-
"g++ & Eigen-3", "icpc & Eigen-3",
110+
"g++ & Eigen-3", "clang++ & Eigen-3",
111111
"GFortran-builtin", "ifort-builtin",
112112
BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "MKL"
113113
]
@@ -160,7 +160,7 @@ function dot_bench!(br, s, i)
160160
br[9,i] = n_gflop / @belapsed dot($a, $b)
161161
end
162162
function benchmark_dot(sizes)
163-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
163+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
164164
br = BenchmarkResult(tests, sizes)
165165
sm = br.sizedresults.results
166166
pmap(is -> dot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -189,7 +189,7 @@ function selfdot_bench!(br, s, i)
189189
br[9,i] = n_gflop / @belapsed dot($a, $a)
190190
end
191191
function benchmark_selfdot(sizes)
192-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
192+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
193193
br = BenchmarkResult(tests, sizes)
194194
sm = br.sizedresults.results
195195
pmap(is -> selfdot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -273,11 +273,11 @@ function dot3_bench!(br, s, i)
273273
br[7,i] = n_gflop / @belapsed edot3($x, $A, $y)
274274
@assert edot3(x, A, y) ≈ dotblas "eigen dot wrong?"
275275
br[8,i] = n_gflop / @belapsed iedot3($x, $A, $y)
276-
@assert iedot3(x, A, y) ≈ dotblas "i-eigen dot wrong?"
276+
@assert iedot3(x, A, y) ≈ dotblas "c-eigen dot wrong?"
277277
br[9,i] = n_gflop / @belapsed dot($x, $A, $y)
278278
end
279279
function benchmark_dot3(sizes)
280-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LinearAlgebra" ]
280+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", "LinearAlgebra" ]
281281
br = BenchmarkResult(tests, sizes)
282282
sm = br.sizedresults.results
283283
pmap(is -> dot3_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -316,7 +316,7 @@ function sse_bench!(br, s, i)
316316
br[9,i] = n_gflop / @belapsed sse!($Xβ, $y, $X, $β)
317317
end
318318
function benchmark_sse(sizes)
319-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
319+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
320320
br = BenchmarkResult(tests, sizes)
321321
sm = br.sizedresults.results
322322
pmap(is -> sse_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -370,7 +370,7 @@ function aplusBc_bench!(br, s, i)
370370
@assert D ≈ Dcopy "i-eigen wrong?"; fill!(D, NaN);
371371
end
372372
function benchmark_aplusBc(sizes)
373-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3"]
373+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3"]
374374
br = BenchmarkResult(tests, sizes)
375375
sm = br.sizedresults.results
376376
pmap(is -> aplusBc_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -402,7 +402,7 @@ function AplusAt_bench!(br, s, i)
402402
@assert B ≈ baseB "ifort-builtin wrong?"; fill!(B, NaN);
403403
end
404404
function benchmark_AplusAt(sizes)
405-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "GFortran-builtin", "ifort-builtin"]
405+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", "GFortran-builtin", "ifort-builtin"]
406406
br = BenchmarkResult(tests, sizes)
407407
sm = br.sizedresults.results
408408
pmap(is -> AplusAt_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -453,9 +453,14 @@ function logdettriangle_bench!(br, s, i)
453453
@assert ld ≈ iclogdettriangle(U) "icc wrong?"
454454
br[6,i] = n_gflop / @belapsed iflogdettriangle($U)
455455
@assert ld ≈ iflogdettriangle(U) "ifort wrong?"
456+
# br[7,i] = n_gflop / @belapsed elogdettriangle($U)
457+
# @assert ld ≈ elogdettriangle(U) "eigen wrong?"; fill!(B, NaN);
458+
# br[8,i] = n_gflop / @belapsed ielogdettriangle($U)
459+
# @assert ld ≈ ielogdettriangle(U) "i-eigen wrong?"; fill!(B, NaN);
456460
br[7,i] = n_gflop / @belapsed logdet($U)
457461
end
458462
function benchmark_logdettriangle(sizes)
463+
# tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", "LinearAlgebra"]
459464
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "LinearAlgebra"]
460465
br = BenchmarkResult(tests, sizes)
461466
sm = br.sizedresults.results

benchmark/driver.jl

Lines changed: 23 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# const LOOPVECBENCHDIR = joinpath(pkgdir("LoopVectorization"), "benchmarks")
33
# includet(joinpath(LOOPVECBENCHDIR, "driver.jl"))
44

5-
using Distributed, LoopVectorization
5+
using Distributed, LoopVectorization, JLD2
66

77
const LOOPVECBENCHDIR = joinpath(pkgdir(LoopVectorization), "benchmark")
88
include(joinpath(LOOPVECBENCHDIR, "benchmarkflops.jl"))
@@ -22,35 +22,35 @@ end
2222
# sizes = 23:23
2323
sizes = 256:-1:2
2424

25-
AmulB_bench = benchmark_AmulB(sizes)
26-
AmulBt_bench = benchmark_AmulBt(sizes)
27-
AtmulBt_bench = benchmark_AtmulBt(sizes)
28-
AtmulB_bench = benchmark_AtmulB(sizes)
25+
logdettriangle_bench = benchmark_logdettriangle(sizes); println("logdet(LowerTriangular(A)) benchmark results:"); println(logdettriangle_bench)
26+
dot3_bench = benchmark_dot3(sizes); println("x' * A * y benchmark results:"); println(dot3_bench)
2927

30-
Amulvb_bench = benchmark_Amulvb(sizes)
31-
Atmulvb_bench = benchmark_Atmulvb(sizes)
28+
AmulB_bench = benchmark_AmulB(sizes); println("A * B benchmark results:"); println(AmulB_bench)
29+
AmulBt_bench = benchmark_AmulBt(sizes); println("A * B' benchmark results:"); println(AmulBt_bench)
30+
AtmulBt_bench = benchmark_AtmulBt(sizes); println("A' * B' benchmark results:"); println(AtmulBt_bench)
31+
AtmulB_bench = benchmark_AtmulB(sizes); println("A' * B benchmark results:"); println(AtmulB_bench)
3232

33-
filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes)#512:-1:2)
34-
filter2d_3x3_bench = benchmark_filter2d3x3(sizes)#512:-1:2)
35-
filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes)#512:-1:2)
33+
Amulvb_bench = benchmark_Amulvb(sizes); println("A * b benchmark results:"); println(Amulvb_bench)
34+
Atmulvb_bench = benchmark_Atmulvb(sizes); println("A' * b benchmark results:"); println(Atmulvb_bench)
3635

37-
dot3_bench = benchmark_dot3(sizes)
38-
dot_bench = benchmark_dot(sizes)
39-
selfdot_bench = benchmark_selfdot(sizes)
40-
sse_bench = benchmark_sse(sizes)
41-
aplusBc_bench = benchmark_aplusBc(sizes)
42-
AplusAt_bench = benchmark_AplusAt(sizes)
43-
vexp_bench = benchmark_exp(sizes)
44-
randomaccess_bench = benchmark_random_access(sizes)
45-
logdettriangle_bench = benchmark_logdettriangle(sizes)
36+
filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes); println("Benchmark results for dynamically sized 3x3 convolution:"); println(filter2d_dynamic_bench)
37+
filter2d_3x3_bench = benchmark_filter2d3x3(sizes); println("Benchmark results for statically sized 3x3 convolution:"); println(filter2d_3x3_bench)
38+
filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes); println("Benchmark results for unrolled 3x3 convolution:"); println(filter2d_unrolled_bench)
39+
40+
dot_bench = benchmark_dot(sizes); println("a' * b benchmark results:"); println(dot_bench)
41+
selfdot_bench = benchmark_selfdot(sizes); println("a' * a benchmark results:"); println(selfdot_bench)
42+
# Run the sum-of-squared-errors benchmark and print its results.
sse_bench = benchmark_sse(sizes); println("Benchmark results of summing squared error:"); println(sse_bench)
43+
aplusBc_bench = benchmark_aplusBc(sizes); println("Benchmark results of a .+ B .* c':"); println(aplusBc_bench)
44+
# Run the AplusAt benchmark and print its results.
# NOTE(review): label changed from "A * A'" to "A + A'" to follow the plus/mul naming
# used by the neighbouring benchmarks — confirm against benchmark_AplusAt.
AplusAt_bench = benchmark_AplusAt(sizes); println("Benchmark results of A + A':"); println(AplusAt_bench)
45+
vexp_bench = benchmark_exp(sizes); println("Benchmark results of exponentiating a vector:"); println(vexp_bench)
46+
randomaccess_bench = benchmark_random_access(sizes); println("Benchmark results from using a vector of indices:"); println(randomaccess_bench)
4647

4748
const v = 1
4849
using Cairo, Fontconfig
4950
const PICTURES = joinpath(pkgdir(LoopVectorization), "docs", "src", "assets")
50-
function saveplot(f, br)
51-
draw(PNG(joinpath(PICTURES, f * "$v.png"), 12inch, 8inch), plot(br))
52-
end
51+
# Plot the benchmark results `br` and draw them as a 12in x 8in PNG whose file name is
# `f` followed by the version number `v`, placed in the PICTURES directory.
function saveplot(f, br)
    target = joinpath(PICTURES, f * "$v.png")
    return draw(PNG(target, 12inch, 8inch), plot(br))
end
5352

53+
saveplot("bench_logdettriangle_v", logdettriangle_bench);
5454
saveplot("bench_filter2d_dynamic_v", filter2d_dynamic_bench);
5555
saveplot("bench_filter2d_3x3_v", filter2d_3x3_bench);
5656
saveplot("bench_filter2d_unrolled_v", filter2d_unrolled_bench);
@@ -62,27 +62,12 @@ saveplot("bench_aplusBc_v", aplusBc_bench);
6262
saveplot("bench_AplusAt_v", AplusAt_bench);
6363
saveplot("bench_exp_v", vexp_bench);
6464
saveplot("bench_random_access_v", randomaccess_bench);
65-
saveplot("bench_logdettriangle_v", logdettriangle_bench);
6665
saveplot("bench_AmulB_v", AmulB_bench);
6766
saveplot("bench_AmulBt_v", AmulBt_bench);
6867
saveplot("bench_AtmulB_v", AtmulB_bench);
6968
saveplot("bench_AtmulBt_v", AtmulBt_bench);
7069
saveplot("bench_Amulvb_v", Amulvb_bench);
7170
saveplot("bench_Atmulvb_v", Atmulvb_bench);
7271

73-
74-
75-
76-
# plot(gemm_bench)
77-
# plot(AtmulB_bench)
78-
# plot(dot_bench)
79-
# plot(selfdot_bench)
80-
# plot(gemv_bench)
81-
# plot(dot3_bench)
82-
# plot(sse_bench)
83-
# plot(vexp_bench)
84-
# plot(aplusBc_bench)
85-
# plot(AplusAt_bench)
86-
87-
72+
@save "benchmarkresults.jld2" logdettriangle_bench filter2d_dynamic_bench filter2d_3x3_bench filter2d_unrolled_bench dot_bench selfdot_bench dot3_bench sse_bench aplusBc_bench AplusAt_bench vexp_bench randomaccess_bench AmulB_bench AmulBt_bench AtmulB_bench AtmulBt_bench Amulvb_bench Atmulvb_bench
8873

benchmark/loadsharedlibs.jl

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,13 @@ end
3535
eigenfile = joinpath(LOOPVECBENCHDIR, "looptestseigen.cpp")
3636
if !isfile(LIBEIGENTEST) || mtime(eigenfile) > mtime(LIBEIGENTEST)
3737
# Clang seems to have trouble finding includes
38-
run(`g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
39-
# run(`clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/c++/9 -I/usr/include/c++/9/x86_64-generic-linux -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
38+
run(`g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
39+
4040
end
4141
if !isfile(LIBIEIGENTEST) || mtime(eigenfile) > mtime(LIBIEIGENTEST)
42-
run(`icpc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
42+
# run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/c++/9 -I/usr/include/c++/9/x86_64-generic-linux -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
43+
# Write the clang++ build to LIBIEIGENTEST (the file the enclosing `if` checks) so it
# does not overwrite the g++ build stored in LIBEIGENTEST.
run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
44+
# run(`icpc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
4345
end
4446

4547
MKL_ROOT = "/home/chriselrod/intel"
@@ -382,16 +384,16 @@ end
382384
B, A, Ref(N)
383385
)
384386
end
385-
for (p,s) ∈ [(:c,Cshared) (:e,Eshared)]
386-
@eval function $(Symbol(prefix,p,:AplusAt!))(B, A)
387-
N = size(B,1)
388-
ccall(
389-
(:AplusAt, $s), Cvoid,
390-
(Ptr{Float64}, Ptr{Float64}, Clong),
391-
B, A, N
392-
)
387+
for (p,s) ∈ [(:c,Cshared) (:e,Eshared)]
388+
@eval function $(Symbol(prefix,p,:AplusAt!))(B, A)
389+
N = size(B,1)
390+
ccall(
391+
(:AplusAt, $s), Cvoid,
392+
(Ptr{Float64}, Ptr{Float64}, Clong),
393+
B, A, N
394+
)
395+
end
393396
end
394-
end
395397
@eval function $(Symbol(prefix,:crandomaccess))(P, basis, coefs)
396398
A, C = size(P)
397399
ccall(
@@ -408,14 +410,16 @@ end
408410
P, basis, coefs, Ref(A), Ref(C)
409411
)
410412
end
411-
@eval function $(Symbol(prefix,:clogdettriangle))(T::Union{LowerTriangular,UpperTriangular})
412-
N = size(T,1)
413-
Tp = parent(T)
414-
ccall(
415-
(:logdettriangle, $Cshared), Float64,
416-
(Ptr{Float64}, Clong),
417-
Tp, N
418-
)
413+
for (p,s) ∈ [(:c,Cshared) (:e,Eshared)]
414+
@eval function $(Symbol(prefix,p,:logdettriangle))(T::Union{LowerTriangular,UpperTriangular})
415+
N = size(T,1)
416+
Tp = parent(T)
417+
ccall(
418+
(:logdettriangle, $s), Float64,
419+
(Ptr{Float64}, Clong),
420+
Tp, N
421+
)
422+
end
419423
end
420424
@eval function $(Symbol(prefix,:flogdettriangle))(T::Union{LowerTriangular,UpperTriangular})
421425
N = size(T,1)

benchmark/looptests.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ double selfdot(double* restrict a, long N){
131131
}
132132
return s;
133133
}
134-
double dot3(double* restrict x, double* restrict A, double* restrict y, long M, long N){
134+
double dot3v2(double* restrict x, double* restrict A, double* restrict y, long M, long N){
135135
double s = 0.0;
136136
for (long n = 0; n < N; n++){
137137
for (long m = 0; m < M; m++){
@@ -140,7 +140,7 @@ double dot3(double* restrict x, double* restrict A, double* restrict y, long M,
140140
}
141141
return s;
142142
}
143-
double dot3v2(double* restrict x, double* restrict A, double* restrict y, long M, long N){
143+
double dot3(double* restrict x, double* restrict A, double* restrict y, long M, long N){
144144
double s = 0.0;
145145
for (long n = 0; n < N; n++){
146146
double t = 0.0;

benchmark/looptests.f90

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -151,30 +151,30 @@ real(C_double) function selfdot(a, N) BIND(C, name="selfdot")
151151
selfdot = selfdot + a(i) * a(i)
152152
end do
153153
end function selfdot
154-
real(C_double) function dot3(x, A, y, M, N) BIND(C, name="dot3")
154+
real(C_double) function dot3v2(x, A, y, M, N) BIND(C, name="dot3v2")
155155
integer(C_long), intent(in) :: M, N
156156
real(C_double), intent(in) :: x(M), A(M,N), y(N)
157157
real(C_double) :: t
158158
integer(C_long) :: mm, nn
159-
dot3 = 0.0d0
159+
dot3v2 = 0.0d0
160160
do concurrent(nn = 1:N, mm = 1:M)
161-
dot3 = dot3 + x(mm) * A(mm, nn) * y(nn)
161+
dot3v2 = dot3v2 + x(mm) * A(mm, nn) * y(nn)
162162
end do
163-
end function dot3
164-
real(C_double) function dot3v2(x, A, y, M, N) BIND(C, name="dot3v2")
163+
end function dot3v2
164+
real(C_double) function dot3(x, A, y, M, N) BIND(C, name="dot3")
165165
integer(C_long), intent(in) :: M, N
166166
real(C_double), intent(in) :: x(M), A(M,N), y(N)
167167
real(C_double) :: t
168168
integer(C_long) :: mm, nn
169-
dot3v2 = 0.0d0
169+
dot3 = 0.0d0
170170
do concurrent(nn = 1:N)
171171
t = 0.0d0
172172
do concurrent(mm = 1:M)
173173
t = t + x(mm) * A(mm, nn)
174174
end do
175-
dot3v2 = dot3v2 + t * y(nn)
175+
dot3 = dot3 + t * y(nn)
176176
end do
177-
end function dot3v2
177+
end function dot3
178178
real(C_double) function dot3builtin(x, A, y, M, N) BIND(C, name="dot3builtin")
179179
integer(C_long), intent(in) :: M, N
180180
real(C_double), intent(in) :: x(M), A(M,N), y(N)

benchmark/looptests.jl

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,24 @@
11
using LoopVectorization, LinearAlgebra, OffsetArrays
22
BLAS.set_num_threads(1)
33

4+
using LoopVectorization.VectorizationBase: StaticUnitRange
45
struct SizedOffsetMatrix{T,LR,UR,LC,RC} <: DenseMatrix{T}
56
data::Matrix{T}
67
end
7-
using LoopVectorization.VectorizationBase: StaticUnitRange
88
Base.axes(::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (StaticUnitRange{LR,UR}(),StaticUnitRange{LC,UC}())
9+
Base.parent(A::SizedOffsetMatrix) = A.data
910
@generated function LoopVectorization.stridedpointer(A::SizedOffsetMatrix{T,LR,UR,LC,RC}) where {T,LR,UR,LC,RC}
1011
quote
1112
$(Expr(:meta,:inline))
1213
LoopVectorization.OffsetStridedPointer(
13-
LoopVectorization.StaticStridedPointer{$T,Tuple{1,$(UR-LR+1)}}(pointer(A.data)),
14-
($(LR-2), $(LC-2))
14+
LoopVectorization.StaticStridedPointer{$T,Tuple{1,$(UR-LR+1)}}(pointer(parent(A))),
15+
($(LR-1), $(LC-1))
1516
)
1617
end
1718
end
19+
Base.getindex(A::SizedOffsetMatrix, i, j) = LoopVectorization.vload(LoopVectorization.stridedpointer(A), (i-1,j-1))
20+
Base.axes(::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (StaticUnitRange{LR,UR}(),StaticUnitRange{LC,UC}())
1821
Base.size(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (1 + UR-LR, 1 + UC-LC)
19-
Base.getindex(A::SizedOffsetMatrix, i, j) = LoopVectorization.vload(LoopVectorization.stridedpointer(A), (i,j)) # only needed to print
2022
Base.unsafe_convert(::Type{Ptr{Float64}}, A::SizedOffsetMatrix) = Base.unsafe_convert(Ptr{Float64}, A.data)
2123

2224

@@ -97,40 +99,38 @@ function jselfdotavx(a)
9799
end
98100
s
99101
end
100-
function jdot3(x, A, y)
102+
function jdot3v2(x, A, y)
101103
M, N = size(A)
102104
s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
103105
@inbounds @fastmath for n ∈ 1:N, m ∈ 1:M
104106
s += x[m] * A[m,n] * y[n]
105107
end
106108
s
107109
end
108-
function jdot3avx(x, A, y)
110+
function jdot3v2avx(x, A, y)
109111
M, N = size(A)
110112
s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
111113
@avx for n ∈ 1:N, m ∈ 1:M
112114
s += x[m] * A[m,n] * y[n]
113115
end
114116
s
115117
end
116-
function jdot3v2(x, A, y)
117-
M, N = size(A)
118+
function jdot3(x, A, y)
118119
s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
119-
@inbounds @fastmath for n ∈ 1:N
120+
@inbounds @fastmath for n ∈ axes(A,2)
120121
t = zero(s)
121-
@simd ivdep for m ∈ 1:M
122+
@simd ivdep for m ∈ axes(A,1)
122123
t += x[m] * A[m,n]
123124
end
124125
s += t * y[n]
125126
end
126127
s
127128
end
128-
function jdot3v2avx(x, A, y)
129-
M, N = size(A)
129+
function jdot3avx(x, A, y)
130130
s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
131-
@avx for n ∈ 1:N
131+
@avx for n ∈ axes(A,2)
132132
t = zero(s)
133-
for m ∈ 1:M
133+
for m ∈ axes(A,1)
134134
t += x[m] * A[m,n]
135135
end
136136
s += t * y[n]

0 commit comments

Comments
 (0)