Skip to content

Commit ee37e09

Browse files
committed
Make shuffleloadstore functions nicer, only ignore cost of +/- loop-induct-var dependent compute ops
1 parent ac80508 commit ee37e09

File tree

5 files changed

+87
-27
lines changed

5 files changed

+87
-27
lines changed

benchmark/openmp.c

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#include<omp.h>
2+
3+
double dot(double* a, double* b, long N){
4+
double s = 0.0;
5+
#pragma omp parallel for reduction(+: s)
6+
for(long n = 0; n < N; n++){
7+
s += a[n]*b[n];
8+
}
9+
return s;
10+
}
11+
12+
void cdot(double* c, double* a, double* b, long N){
13+
double r = 0.0, i = 0.0;
14+
#pragma omp parallel for reduction(+: r, i)
15+
for(long n = 0; n < N; n++){
16+
r += a[2*n] * b[2*n ] + a[2*n+1] * b[2*n+1];
17+
i += a[2*n] * b[2*n+1] - a[2*n+1] * b[2*n ];
18+
}
19+
c[0] = r;
20+
c[1] = i;
21+
return;
22+
}
23+
24+
void cdot3(double* c, double* x, double* A, double* y, long M, long N){
25+
double sr = 0.0, si = 0.0;
26+
#pragma omp parallel for reduction(+: sr, si)
27+
for (long n = 0; n < N; n++){
28+
double tr = 0.0, ti = 0.0;
29+
for(long m = 0; m < M; m++){
30+
tr += x[2*m] * A[2*m + n*N] + x[2*m+1] * A[2*m+1 + n*N];
31+
ti += x[2*m] * A[2*m+1 + n*N] - x[2*m+1] * A[2*m + n*N];
32+
}
33+
sr += tr * y[2*n ] - ti * y[2*n+1];
34+
si += tr * y[2*n+1] + ti * y[2*n ];
35+
}
36+
c[0] = sr;
37+
c[1] = si;
38+
return;
39+
}
40+
41+
void conv(double* B, double* A, double* K, long M, long N){
42+
const long offset = 2;
43+
#pragma omp parallel for collapse(2)
44+
for (long i = offset; i < N-offset; i++){
45+
for (long j = offset; j < M-offset; j++){
46+
double tmp = 0.0;
47+
for (long k = -offset; k < offset + 1; k++){
48+
for (long l = -offset; l < offset + 1; l++){
49+
tmp += A[(j+l) + (i+k)*M] * K[(l+offset) + (k+offset)*(2*offset+1)];
50+
}
51+
}
52+
B[(j-offset) + (i-offset) * (M-2*offset)] = tmp;
53+
}
54+
}
55+
return;
56+
}
57+
58+

src/codegen/lower_threads.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,11 +138,11 @@ end
138138
end
139139

140140
# if a threaded loop is vectorized, call
141-
function choose_num_blocks(M, ::StaticInt{U}, nt) where {U}
141+
@inline function choose_num_blocks(M, ::StaticInt{U}, nt) where {U}
142142
_choose_num_blocks(M % UInt, StaticInt{U}(), nt, lv_max_num_threads())
143143
end
144144
# otherwise, call
145-
choose_num_blocks(nt, ::StaticInt{NC} = lv_max_num_threads()) where {NC} = @inbounds choose_num_block_table(StaticInt{NC}())[nt]
145+
@inline choose_num_blocks(nt, ::StaticInt{NC} = lv_max_num_threads()) where {NC} = @inbounds choose_num_block_table(StaticInt{NC}())[nt]
146146

147147

148148

@@ -160,7 +160,7 @@ choose_num_blocks(nt, ::StaticInt{NC} = lv_max_num_threads()) where {NC} = @inbo
160160
# block_per_m, blocks_per_n
161161
# end
162162

163-
function choose_num_threads(::Val{C}, ::Val{NT}, x) where {C,NT}
163+
@inline function choose_num_threads(::Val{C}, ::Val{NT}, x) where {C,NT}
164164
fx = Base.uitofp(Float64, x)
165165
min(Base.fptoui(UInt, Base.ceil_llvm(0.05460264079015985*C*Base.sqrt_llvm(fx))), NT)
166166
end
@@ -194,7 +194,7 @@ function push_loop_length_expr!(q::Expr, ls::LoopSet)
194194
end
195195
nothing
196196
end
197-
function divrem_fast(numerator, denominator)
197+
@inline function divrem_fast(numerator, denominator)
198198
d = Base.udiv_int(numerator, denominator)
199199
r = numerator - denominator*d
200200
d, r

src/modeling/determinestrategy.jl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,9 @@ function cost(ls::LoopSet, op::Operation, (u₁,u₂)::Tuple{Symbol,Symbol}, vlo
7474
if instr == Instruction(:-) || instr === Instruction(:sub_fast) || instr == Instruction(:+) || instr == Instruction(:add_fast)
7575
return 0.0, 0, 0.0
7676
end
77-
elseif iscompute(op) && all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op))
77+
elseif iscompute(op) &&
78+
Base.sym_in(instruction(op).instr, (:(+), :(-), :add_fast, :sub_fast)) &&
79+
all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op))
7880
return 0.0, 0, 0.0
7981
end
8082
opisvectorized = isvectorized(op)
@@ -189,7 +191,8 @@ function evaluate_cost_unroll(
189191
included_vars[id] = true
190192
# @show op, cost(ls, op, vloopsym, Wshift, size_T)
191193
# TODO: use actual unrolls here?
192-
total_cost += iter * first(cost(ls, op, (Symbol(""),Symbol("")), vloopsym, Wshift, size_T))
194+
c = first(cost(ls, op, (Symbol(""),Symbol("")), vloopsym, Wshift, size_T))
195+
total_cost += iter * c
193196
total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest
194197
end
195198
end

test/shuffleloadstores.jl

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,34 +8,37 @@ function dot_simd(a::AbstractVector, b::AbstractVector)
88
end
99
s
1010
end
11-
function cdot_mat(a::AbstractMatrix, b::AbstractMatrix)
12-
re = zero(eltype(a))
13-
im = zero(eltype(a))
11+
function cdot_mat(ca::AbstractVector{Complex{T}}, cb::AbstractVector{Complex{T}}) where {T}
12+
a = reinterpret(reshape, T, ca)
13+
b = reinterpret(reshape, T, cb)
14+
re = zero(T); im = zero(T)
1415
@avx for i ∈ axes(a,2)
1516
re += a[1,i] * b[1,i] + a[2,i] * b[2,i]
1617
im += a[1,i] * b[2,i] - a[2,i] * b[1,i]
1718
end
18-
Complex(re,im)
19+
Complex(re, im)
1920
end
20-
function cdot_affine(a::AbstractVector, b::AbstractVector)
21-
re = zero(eltype(a))
22-
im = zero(eltype(a))
21+
function cdot_affine(ca::AbstractVector{Complex{T}}, cb::AbstractVector{Complex{T}}) where {T}
22+
a = reinterpret(T, ca);
23+
b = reinterpret(T, cb);
24+
re = zero(T); im = zero(T)
2325
# with a multiplier, we go from `i = 1 -> 2i = 2` to `i = 0 -> 2i = 0`
2426
# 2(i+1-1) = 2i + 2 - 2, so....
2527
@avx for i ∈ 1:length(a)>>>1
2628
re += a[2i-1] * b[2i-1] + a[2i] * b[2i ]
2729
im += a[2i-1] * b[2i ] - a[2i] * b[2i-1]
2830
end
29-
Complex(re,im)
31+
Complex(re, im)
3032
end
31-
function cdot_stride(a::AbstractVector, b::AbstractVector)
32-
re = zero(eltype(a))
33-
im = zero(eltype(a))
33+
function cdot_stride(ca::AbstractVector{Complex{T}}, cb::AbstractVector{Complex{T}}) where {T}
34+
a = reinterpret(T, ca);
35+
b = reinterpret(T, cb);
36+
re = zero(T); im = zero(T)
3437
@avx for i ∈ 1:2:length(a)
3538
re += a[i] * b[i ] + a[i+1] * b[i+1]
3639
im += a[i] * b[i+1] - a[i+1] * b[i ]
3740
end
38-
Complex(re,im)
41+
Complex(re, im)
3942
end
4043
function qdot_simd(x::AbstractVector{NTuple{4,T}}, y::AbstractVector{NTuple{4,T}}) where {T}
4144
a = zero(T)
@@ -132,15 +135,11 @@ end
132135
for i ∈ 1:128
133136
ac = rand(Complex{Float64}, i);
134137
bc = rand(Complex{Float64}, i);
135-
acv = reinterpret(Float64, ac);
136-
bcv = reinterpret(Float64, bc);
137138
dsimd = dot_simd(ac, bc)
138139
if VERSION ≥ v"1.6.0-rc1"
139-
acm = reinterpret(reshape, Float64, ac);
140-
bcm = reinterpret(reshape, Float64, bc);
141-
@test dsimd ≈ cdot_mat(acm, bcm)
140+
@test dsimd ≈ cdot_mat(ac, bc)
142141
end
143-
@test dsimd ≈ cdot_affine(acv, bcv) ≈ cdot_stride(acv, bcv)
142+
@test dsimd ≈ cdot_affine(ac, bc) ≈ cdot_stride(ac, bc)
144143

145144

146145
xq = [ntuple(_ -> rand(), Val(4)) for _ ∈ 1:i];

test/threading.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@ function AmulB!(C,A,B)
1717
C
1818
end
1919
function dot3(x::AbstractVector{Complex{T}}, A::AbstractMatrix{Complex{T}}, y::AbstractVector{Complex{T}}) where {T}
20-
xr = reinterpret(reshape, Float64, x);
21-
yr = reinterpret(reshape, Float64, y);
22-
Ar = reinterpret(reshape, Float64, A);
20+
xr = reinterpret(reshape, T, x);
21+
yr = reinterpret(reshape, T, y);
22+
Ar = reinterpret(reshape, T, A);
2323
sre = zero(T)
2424
sim = zero(T)
2525
@avxt for n in axes(Ar,3)

0 commit comments

Comments
 (0)