Skip to content

Commit 6389458

Browse files
committed
Compute register pressure with Float64, and hopefully improve performance for @avx and user-defined functions.
1 parent 213f582 commit 6389458

File tree

7 files changed

+41
-34
lines changed

7 files changed

+41
-34
lines changed

benchmark/looptests.jl

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -278,26 +278,22 @@ end
278278

279279

280280
function filter2d!(out::AbstractMatrix, A::AbstractMatrix, kern)
281-
rng1k, rng2k = axes(kern)
282-
rng1, rng2 = axes(out)
283-
@inbounds @fastmath for j in rng2, i in rng1
281+
@inbounds @fastmath for J in CartesianIndices(out)
284282
tmp = zero(eltype(out))
285-
for jk in rng2k, ik in rng1k
286-
tmp += A[i+ik,j+jk]*kern[ik,jk]
283+
for I ∈ CartesianIndices(kern)
284+
tmp += A[I + J] * kern[I]
287285
end
288-
out[i,j] = tmp
286+
out[J] = tmp
289287
end
290288
out
291289
end
292290
function filter2davx!(out::AbstractMatrix, A::AbstractMatrix, kern)
293-
rng1k, rng2k = axes(kern)
294-
rng1, rng2 = axes(out)
295-
@avx for j in rng2, i in rng1
291+
@avx for J in CartesianIndices(out)
296292
tmp = zero(eltype(out))
297-
for jk in rng2k, ik in rng1k
298-
tmp += A[i+ik,j+jk]*kern[ik,jk]
293+
for I ∈ CartesianIndices(kern)
294+
tmp += A[I + J] * kern[I]
299295
end
300-
out[i,j] = tmp
296+
out[J] = tmp
301297
end
302298
out
303299
end

src/condense_loopset.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,9 @@ function add_external_functions!(q::Expr, ls::LoopSet)
181181
for op ∈ operations(ls)
182182
if iscompute(op)
183183
instr = instruction(op)
184-
if instr.mod !== :LoopVectorization
184+
if instr.mod === :Main
185+
push!(q.args, instr.instr)
186+
elseif instr.mod !== :LoopVectorization
185187
push!(q.args, Expr(:(.), instr.mod, QuoteNode(instr.instr)))
186188
end
187189
end

src/costs.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ Base.convert(::Type{Expr}, instr::Instruction) = Expr(:(.), instr.mod, QuoteNode
1818
function Base.Expr(instr::Instruction, args...)
1919
if instr.mod === :LoopVectorization
2020
Expr(:call, lv(instr.instr), args...)::Expr
21+
elseif instr.mod === :Main
22+
Expr(:call, instr.instr, args...)::Expr
2123
else
2224
Expr(:call, convert(Expr, instr), args...)::Expr
2325
end

src/determinestrategy.jl

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -349,10 +349,10 @@ end
349349
function solve_tilesize(
350350
ls::LoopSet, unrolled::Symbol, tiled::Symbol,
351351
cost_vec::AbstractVector{Float64},
352-
reg_pressure::AbstractVector{Int},
352+
reg_pressure::AbstractVector{Float64},
353353
W::Int, vectorized::Symbol
354354
)
355-
maxTbase = maxUbase = 4
355+
maxTbase = maxUbase = 4#8
356356
maxT = maxTbase#8
357357
maxU = maxUbase#8
358358
tiledloop = getloop(ls, tiled)
@@ -405,8 +405,8 @@ function stride_penalty(ls::LoopSet, order::Vector{Symbol})
405405
end
406406
stridepenalty# * 1e-9
407407
end
408-
function convolution_cost_factor(ls::LoopSet, op::Operation, u1::Symbol, u2::Symbol, v::Symbol)
409-
(u1 ∈ loopdependencies(op) && u2 ∈ loopdependencies(op)) || return 1.0
408+
function convolution_cost_factor(ls::LoopSet, op::Operation, u1::Symbol, u2::Symbol)
409+
(u1 ∈ loopdependencies(op) && u2 ∈ loopdependencies(op)) || return 1.0, 1.0
410410
istranslation = false
411411
inds = getindices(op); li = op.ref.loopedindex
412412
for i ∈ eachindex(li)
@@ -417,7 +417,7 @@ function convolution_cost_factor(ls::LoopSet, op::Operation, u1::Symbol, u2::Sym
417417
end
418418
end
419419
end
420-
istranslation ? 0.25 : 1.0
420+
istranslation ? (0.25, 1.0) : (1.0, 1.0)
421421
end
422422
# Just tile outer two loops?
423423
# But optimal order within tile must still be determined
@@ -484,8 +484,8 @@ function evaluate_cost_tile(
484484
istiled = unrolledtiled[2,id]
485485
rt, lat, rp = cost(ls, op, vectorized, Wshift, size_T)
486486
if isload(op)
487-
factor = convolution_cost_factor(ls, op, unrolled, tiled, vectorized)
488-
rt *= factor#; rp *= factor;
487+
factor1, factor2 = convolution_cost_factor(ls, op, unrolled, tiled)
488+
rt *= factor1; rp *= factor2;
489489
end
490490
# @show op rt, lat, rp
491491
rp = opisininnerloop ? rp : 0 # we only care about register pressure within the inner most loop

src/graphs.jl

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ struct LoopSet
192192
syms_aliasing_refs::Vector{Symbol}
193193
refs_aliasing_syms::Vector{ArrayReferenceMeta}
194194
cost_vec::Matrix{Float64}
195-
reg_pres::Matrix{Int}
195+
reg_pres::Matrix{Float64}
196196
included_vars::Vector{Bool}
197197
place_after_loop::Vector{Bool}
198198
W::Symbol
@@ -296,7 +296,7 @@ function LoopSet(mod::Symbol)# = :LoopVectorization)
296296
Symbol[], Symbol[], Symbol[],
297297
ArrayReferenceMeta[],
298298
Matrix{Float64}(undef, 4, 2),
299-
Matrix{Int}(undef, 4, 2),
299+
Matrix{Float64}(undef, 4, 2),
300300
Bool[], Bool[],
301301
gensym(:W), gensym(:T), mod
302302
)
@@ -427,12 +427,6 @@ function register_single_loop!(ls::LoopSet, looprange::Expr)
427427
U = add_loop_bound!(ls, itersym, upper, true)
428428
Loop(itersym, L, U)
429429
end
430-
elseif f === :eachindex
431-
N = gensym(Symbol(:loopeachindex, itersym))
432-
pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticrange), r)))
433-
L = add_loop_bound!(ls, itersym, Expr(:call, lv(:maybestaticfirst), N), false)
434-
U = add_loop_bound!(ls, itersym, Expr(:call, lv(:maybestaticlast), N), true)
435-
Loop(itersym, L, U)
436430
elseif f === :OneTo || isscopedname(f, :Base, :OneTo)
437431
otN = r.args[2]
438432
if otN isa Integer
@@ -444,7 +438,11 @@ function register_single_loop!(ls::LoopSet, looprange::Expr)
444438
Loop(itersym, 1, N)
445439
end
446440
else
447-
throw("Unrecognized loop range type: $r.")
441+
N = gensym(Symbol(:loop, itersym))
442+
pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticrange), r)))
443+
L = add_loop_bound!(ls, itersym, Expr(:call, lv(:maybestaticfirst), N), false)
444+
U = add_loop_bound!(ls, itersym, Expr(:call, lv(:maybestaticlast), N), true)
445+
Loop(itersym, L, U)
448446
end
449447
elseif isa(r, Symbol)
450448
# Treat similar to `eachindex`

src/reconstruct_loopset.jl

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ end
158158

159159
function num_parameters(AM)
160160
num_param::Int = AM[1]
161-
num_param += length(AM[2].parameters)
161+
# num_param += length(AM[2].parameters)
162162
num_param + length(AM[3].parameters)
163163
end
164164
function gen_array_syminds(AM)
@@ -335,11 +335,10 @@ function add_ops!(
335335
opsymbol = opsymbols[os.symid]
336336
add_op!(ls, instr[i], ops, nopsv, expandedv, i, mrefs, opsymbol, elementbytes)
337337
end
338-
num_params = add_parents_to_ops!(ls, ops, constoffset)
338+
add_parents_to_ops!(ls, ops, constoffset)
339339
# for op in operations(ls)
340340
# @show op
341341
# end
342-
num_params
343342
end
344343

345344
# elbytes(::VectorizationBase.AbstractPointer{T}) where {T} = sizeof(T)::Int
@@ -385,7 +384,7 @@ function avx_loopset(instr, ops, arf, AM, LPSYM, LB, vargs)
385384
pushpreamble!(ls, Expr(:(=), ls.T, Expr(:call, :promote_type, [Expr(:call, :eltype, vptr(mref)) for mref ∈ mrefs]...)))
386385
pushpreamble!(ls, Expr(:(=), ls.W, Expr(:call, lv(:pick_vector_width_val), [Expr(:call, :eltype, vptr(mref)) for mref ∈ mrefs]...)))
387386
num_params = num_arrays + num_parameters(AM)
388-
num_params = add_ops!(ls, instr, ops, mrefs, opsymbols, num_params, nopsv, expandedv, elementbytes)
387+
add_ops!(ls, instr, ops, mrefs, opsymbols, num_params, nopsv, expandedv, elementbytes)
389388
process_metadata!(ls, AM, length(arf))
390389
add_array_symbols!(ls, arraysymbolinds, num_arrays + length(ls.preamble_symsym))
391390
num_params = extract_external_functions!(ls, num_params)

test/miscellaneous.jl

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,17 @@
193193
ret[j] = clenshaw(x[j], coeff)
194194
end
195195
end
196-
196+
# ret = y2; coeff = c;
197+
# LoopVectorization.@avx_debug for j in 1:length(ret)
198+
# ret[j] = clenshaw(x[j], coeff)
199+
# end
200+
# t = β₁ = β₂ = ρ = s = 0.0; weights = rand(1); nodes = rand(1); lomnibus(args...) = +(args...)
201+
# LoopVectorization.@avx_debug for i ∈ eachindex(weights, nodes)
202+
# s += weights[i] * lomnibus(nodes[i], t, β₁, β₂, ρ)
203+
# end
204+
# @macroexpand @avx for i ∈ eachindex(weights, nodes)
205+
# s += weights[i] * lomnibus(nodes[i], t, β₁, β₂, ρ)
206+
# end
197207
function softmax3_core!(lse, qq, xx, tmpmax, maxk, nk)
198208
for k in Base.OneTo(maxk)
199209
@inbounds for i in eachindex(lse)

0 commit comments

Comments
 (0)