Skip to content

Commit 132ffd0

Browse files
committed
Merge branch 'master' into patch-1
2 parents 3888813 + 05a8124 commit 132ffd0

File tree

13 files changed

+181
-173
lines changed

13 files changed

+181
-173
lines changed

benchmark/looptests.c

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ void gemm_mnk(double* restrict C, double* restrict A, double* restrict B, long M
77
for (long m = 0; m < M; m++){
88
for (long n = 0; n < N; n++){
99
for (long k = 0; k < K; k++){
10-
C[m + n*M] += A[m + k*M] * B[k + n*K];
10+
C[m + n*M] += A[m + k*M] * B[k + n*K];
1111
}
1212
}
1313
}
@@ -20,7 +20,7 @@ void gemm_mkn(double* restrict C, double* restrict A, double* restrict B, long M
2020
for (long m = 0; m < M; m++){
2121
for (long k = 0; k < K; k++){
2222
for (long n = 0; n < N; n++){
23-
C[m + n*M] += A[m + k*M] * B[k + n*K];
23+
C[m + n*M] += A[m + k*M] * B[k + n*K];
2424
}
2525
}
2626
}
@@ -33,7 +33,7 @@ void gemm_nmk(double* restrict C, double* restrict A, double* restrict B, long M
3333
for (long n = 0; n < N; n++){
3434
for (long m = 0; m < M; m++){
3535
for (long k = 0; k < K; k++){
36-
C[m + n*M] += A[m + k*M] * B[k + n*K];
36+
C[m + n*M] += A[m + k*M] * B[k + n*K];
3737
}
3838
}
3939
}
@@ -46,7 +46,7 @@ void gemm_nkm(double* restrict C, double* restrict A, double* restrict B, long M
4646
for (long n = 0; n < N; n++){
4747
for (long k = 0; k < K; k++){
4848
for (long m = 0; m < M; m++){
49-
C[m + n*M] += A[m + k*M] * B[k + n*K];
49+
C[m + n*M] += A[m + k*M] * B[k + n*K];
5050
}
5151
}
5252
}
@@ -59,7 +59,7 @@ void gemm_kmn(double* restrict C, double* restrict A, double* restrict B, long M
5959
for (long k = 0; k < K; k++){
6060
for (long m = 0; m < M; m++){
6161
for (long n = 0; n < N; n++){
62-
C[m + n*M] += A[m + k*M] * B[k + n*K];
62+
C[m + n*M] += A[m + k*M] * B[k + n*K];
6363
}
6464
}
6565
}
@@ -72,7 +72,7 @@ void gemm_knm(double* restrict C, double* restrict A, double* restrict B, long M
7272
for (long k = 0; k < K; k++){
7373
for (long n = 0; n < N; n++){
7474
for (long m = 0; m < M; m++){
75-
C[m + n*M] += A[m + k*M] * B[k + n*K];
75+
C[m + n*M] += A[m + k*M] * B[k + n*K];
7676
}
7777
}
7878
}
@@ -85,7 +85,7 @@ void AtmulB(double* restrict C, double* restrict At, double* restrict B, long M,
8585
for (long n = 0; n < N; n++){
8686
for (long m = 0; m < M; m++){
8787
for (long k = 0; k < K; k++){
88-
C[m + n*M] += At[k + m*K] * B[k + n*K];
88+
C[m + n*M] += At[k + m*K] * B[k + n*K];
8989
}
9090
}
9191
}
@@ -98,7 +98,7 @@ void AmulBt(double* restrict C, double* restrict A, double* restrict Bt, long M,
9898
for (long k = 0; k < K; k++){
9999
for (long n = 0; n < N; n++){
100100
for (long m = 0; m < M; m++){
101-
C[m + n*M] += A[m + M*k] * Bt[n + N*k];
101+
C[m + n*M] += A[m + M*k] * Bt[n + N*k];
102102
}
103103
}
104104
}
@@ -111,7 +111,7 @@ void AtmulBt(double* restrict C, double* restrict A, double* restrict Bt, long M
111111
for (long n = 0; n < N; n++){
112112
for (long k = 0; k < K; k++){
113113
for (long m = 0; m < M; m++){
114-
C[m + n*M] += A[k + K*m] * Bt[n + N*k];
114+
C[m + n*M] += A[k + K*m] * Bt[n + N*k];
115115
}
116116
}
117117
}
@@ -249,9 +249,9 @@ void filter2d(double* restrict B, double* restrict A, double* restrict K, long M
249249
for (long ma = offset; ma < M-offset; ma++){
250250
double tmp = 0.0;
251251
for (long nk = -offset; nk < offset + 1; nk++){
252-
for (long mk = -offset; mk < offset + 1; mk++){
253-
tmp += A[(ma+mk) + (na+nk)*M] * K[(mk+offset) + (nk+offset)*(2*offset+1)];
254-
}
252+
for (long mk = -offset; mk < offset + 1; mk++){
253+
tmp += A[(ma+mk) + (na+nk)*M] * K[(mk+offset) + (nk+offset)*(2*offset+1)];
254+
}
255255
}
256256
B[(ma-offset) + (na-offset) * (M-2*offset)] = tmp;
257257
}
@@ -263,9 +263,9 @@ void filter2d3x3(double* restrict B, double* restrict A, double* restrict K, lon
263263
for (long ma = offset; ma < M-offset; ma++){
264264
double tmp = 0.0;
265265
for (long nk = -offset; nk < offset + 1; nk++){
266-
for (long mk = -offset; mk < offset + 1; mk++){
267-
tmp += A[(ma+mk) + (na+nk)*M] * K[(mk+offset) + (nk+offset)*(2*offset+1)];
268-
}
266+
for (long mk = -offset; mk < offset + 1; mk++){
267+
tmp += A[(ma+mk) + (na+nk)*M] * K[(mk+offset) + (nk+offset)*(2*offset+1)];
268+
}
269269
}
270270
B[(ma-offset) + (na-offset) * (M-2*offset)] = tmp;
271271
}

benchmark/openmp.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@ void conv(double* B, double* A, double* K, long M, long N){
4545
for (long j = offset; j < M-offset; j++){
4646
double tmp = 0.0;
4747
for (long k = -offset; k < offset + 1; k++){
48-
for (long l = -offset; l < offset + 1; l++){
49-
tmp += A[(j+l) + (i+k)*M] * K[(l+offset) + (k+offset)*(2*offset+1)];
50-
}
48+
for (long l = -offset; l < offset + 1; l++){
49+
tmp += A[(j+l) + (i+k)*M] * K[(l+offset) + (k+offset)*(2*offset+1)];
50+
}
5151
}
5252
B[(j-offset) + (i-offset) * (M-2*offset)] = tmp;
5353
}

docs/src/examples/array_interface.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,10 @@ function runbenches(sr, ::Type{T}, fa = identity, fb = identity) where {T}
6161
sa = fill("StaticArrays", length(sr)); lv = fill("LoopVectorization", length(sr));
6262
matmul_lib = vcat(sa, lv, sa, lv);
6363
sizes = reduce(vcat, (sr for _ ∈ 1:4))
64-
DataFrame(
65-
Size = sizes, Time = vec(bench_results), GFLOPS = vec(gflops),
66-
ArrayType = array_type, MatmulLib = matmul_lib, MulType = array_type .* ' ' .* matmul_lib
67-
)
64+
DataFrame(
65+
Size = sizes, Time = vec(bench_results), GFLOPS = vec(gflops),
66+
ArrayType = array_type, MatmulLib = matmul_lib, MulType = array_type .* ' ' .* matmul_lib
67+
)
6868
end
6969

7070
df = runbenches(1:24, Float64);

docs/src/examples/matrix_multiplication.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@ LoopVectorization currently doesn't do any memory-modeling or memory-based optim
66
We can write a single function:
77
```julia
88
function A_mul_B!(C, A, B)
9-
@avx for n ∈ indices((C,B), 2), m ∈ indices((C,A), 1)
9+
@avx for n ∈ indices((C,B), 2), m ∈ indices((C,A), 1)
1010
Cmn = zero(eltype(C))
1111
for k ∈ indices((A,B), (2,1))
1212
Cmn += A[m,k] * B[k,n]
1313
end
14-
C[m,n] = Cmn
14+
C[m,n] = Cmn
1515
end
1616
end
1717
```

docs/src/examples/multithreading.md

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,18 +33,18 @@ relatively primitive arithmetic operations (e.g. `+`, `/`, or `log`), and not, f
3333
I'll make comparisons with OpenMP through the rest of this, starting with a simple dot product to focus on threading overhead:
3434
```julia
3535
function dotavxt(a::AbstractArray{T}, b::AbstractArray{T}) where {T <: Real}
36-
s = zero(T)
37-
@avxt for i ∈ eachindex(a,b)
38-
s += a[i] * b[i]
39-
end
40-
s
36+
s = zero(T)
37+
@avxt for i ∈ eachindex(a,b)
38+
s += a[i] * b[i]
39+
end
40+
s
4141
end
4242
function dotbaseline(a::AbstractArray{T}, b::AbstractArray{T}) where {T}
43-
s = zero(T)
44-
@fastmath @inbounds @simd for i ∈ eachindex(a,b)
45-
s += a[i]' * b[i]
46-
end
47-
s
43+
s = zero(T)
44+
@fastmath @inbounds @simd for i ∈ eachindex(a,b)
45+
s += a[i]' * b[i]
46+
end
47+
s
4848
end
4949
```
5050
In `C`:

docs/src/getting_started.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ function mvp(P, basis, coeffs::Vector{T}) where {T}
2222
end
2323
p += pc
2424
end
25-
p
25+
p
2626
end
2727

2828
maxdeg = 20; nbasis = 1_000; dim = 15;

docs/src/index.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,11 @@ Pages = [
1818
"examples/sum_of_squared_error.md",
1919
"vectorized_convenience_functions.md",
2020
"future_work.md",
21-
"devdocs/overview.md",
22-
"devdocs/loopset_structure.md",
23-
"devdocs/constructing_loopsets.md",
24-
"devdocs/evaluating_loops.md",
25-
"devdocs/lowering.md"
21+
"devdocs/overview.md",
22+
"devdocs/loopset_structure.md",
23+
"devdocs/constructing_loopsets.md",
24+
"devdocs/evaluating_loops.md",
25+
"devdocs/lowering.md"
2626
]
2727
Depth = 1
2828
```

src/modeling/determinestrategy.jl

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11

2+
23
# function indexappearences(op::Operation, s::Symbol)
34
# s ∉ loopdependencies(op) && return 0
45
# appearences = 0
@@ -95,14 +96,15 @@ function cost(ls::LoopSet, op::Operation, (u₁,u₂)::Tuple{Symbol,Symbol}, vlo
9596
shifter = 2
9697
offset = 0.5reg_size(ls) / cache_lnsze(ls)
9798
end
98-
if !rejectcurly(op) && (((contigind === CONSTANTZEROINDEX) && ((length(indices) > 1) && (indices[2] === u₁) || (indices[2] === u₂))) ||
99-
((u₁ === contigind) | (u₂ === contigind)))
99+
if shifter > 1 &&
100+
(!rejectcurly(op) && (((contigind === CONSTANTZEROINDEX) && ((length(indices) > 1) && (indices[2] === u₁) || (indices[2] === u₂))) ||
101+
((u₁ === contigind) | (u₂ === contigind))))
100102

101103
shifter -= 1
102104
offset = 0.5reg_size(ls) / cache_lnsze(ls)
103105
end
104106
r = 1 << shifter
105-
srt *= r + offset
107+
srt = srt*r + offset
106108
sl *= r
107109
elseif isload(op) & (length(loopdependencies(op)) > 1)# vmov(a/u)pd
108110
# penalize vectorized loads with more than 1 loopdep

src/simdfunctionals/filter.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,15 @@ function vfilter!(f::F, x::Vector{T}, y::AbstractArray{T}) where {F,T <: NativeT
77
j = 0
88
st = VectorizationBase.static_sizeof(T)
99
zero_index = MM(W, Static(0), st)
10+
incr = W * VectorizationBase.static_sizeof(T)
1011
GC.@preserve x y begin
1112
# ptr_x = llvmptr(x); ptr_y = llvmptr(y)
1213
ptr_x = pointer(x); ptr_y = pointer(y)
1314
for _ ∈ 1:Nrep
1415
vy = VectorizationBase.__vload(ptr_y, zero_index, False(), register_size())
1516
mask = f(vy)
1617
VectorizationBase.compressstore!(gep(ptr_x, VectorizationBase.lazymul(st, j)), vy, mask)
17-
ptr_y = gep(ptr_y, register_size())
18+
ptr_y = gep(ptr_y, incr)
1819
j = vadd_fast(j, count_ones(mask))
1920
end
2021
rem_mask = VectorizationBase.mask(T, Nrem)

test/copy.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,29 +27,29 @@ using LoopVectorization, OffsetArrays, Test
2727
end
2828
function offset_copy!(A, B)
2929
@inbounds for i=1:size(A,1), j=1:size(B,2)
30-
A[i,j+2] = B[i,j]
30+
A[i,j+2] = B[i,j]
3131
end
3232
end
3333
function offset_copyavx1!(A, B)
3434
@avx for i=1:size(A,1), j=1:size(B,2)
35-
A[i,j+2] = B[i,j]
35+
A[i,j+2] = B[i,j]
3636
end
3737
end
3838
function offset_copy_avx1!(A, B)
3939
@_avx for i=1:size(A,1), j=1:size(B,2)
40-
@inbounds A[i,j+2] = B[i,j]
40+
@inbounds A[i,j+2] = B[i,j]
4141
end
4242
end
4343
function offset_copyavx2!(A, B)
4444
@avx for i=1:size(A,1), j=1:size(B,2)
4545
Bᵢⱼ = B[i,j]
46-
A[i,j+2] = Bᵢⱼ
46+
A[i,j+2] = Bᵢⱼ
4747
end
4848
end
4949
function offset_copy_avx2!(A, B)
5050
@_avx for i=1:size(A,1), j=1:size(B,2)
5151
Bᵢⱼ = B[i,j]
52-
A[i,j+2] = Bᵢⱼ
52+
A[i,j+2] = Bᵢⱼ
5353
end
5454
end
5555
function make2point3avx!(x)

0 commit comments

Comments
 (0)