Skip to content

Commit 4fe3575

Browse files
committed
Each thread that performs non-temporal stores should issue a fence; also consider register reuse in offset loads.
1 parent 3c51cd2 commit 4fe3575

File tree

2 files changed

+66
-60
lines changed

2 files changed

+66
-60
lines changed

src/modeling/determinestrategy.jl

Lines changed: 45 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -792,46 +792,48 @@ function maxnegativeoffset(ls::LoopSet, op::Operation, unrollsyms::UnrollSymbols
792792
mno, i
793793
end
794794
function load_elimination_cost_factor!(
795-
cost_vec, reg_pressure, choose_to_inline, ls::LoopSet, op::Operation, iters, unrollsyms::UnrollSymbols, Wshift, size_T
795+
cost_vec, reg_pressure, choose_to_inline, ls::LoopSet, op::Operation, iters, unrollsyms::UnrollSymbols, Wshift, size_T
796796
)
797-
@unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms
798-
if !iszero(first(isoptranslation(ls, op, unrollsyms)))
799-
rt, lat, rp = cost(ls, op, (u₁loopsym, u₂loopsym), vloopsym, Wshift, size_T)
800-
# rt = Core.ifelse(isvectorized(op), 0.5rt, rt)
801-
rto = rt
802-
rt *= iters
803-
# rt *= factor1; rp *= factor2;
804-
choose_to_inline[] = true
805-
# for loop ∈ ls.loops
806-
# # If another loop is short, assume that LLVM will unroll it, in which case
807-
# # we want to be a little more conservative in terms of register pressure.
808-
# #FIXME: heuristic hack to get some desired behavior.
809-
# if isstaticloop(loop) && length(loop) ≤ 4
810-
# itersym = loop.itersymbol
811-
# if itersym !== u₁loopsym && itersym !== u₂loopsym
812-
# return (0.25, dynamic_register_count() == 32 ? 2.0 : 1.0)
813-
# # return (0.25, 1.0)
814-
# return true
815-
# end
816-
# end
817-
# end
818-
# # (0.25, dynamic_register_count() == 32 ? 1.2 : 1.0)
819-
# (0.25, 1.0)
820-
# cost_vec[1] -= rt
821-
# cost_vec[1] -= 0.5625 * iters
822-
# cost_vec[1] -= 0.5625 * iters / 2
823-
# @show rto, 0.8rt, op
824-
reg_pressure[1] += 0.25rp
825-
cost_vec[2] += rt
826-
reg_pressure[2] += rp
827-
cost_vec[3] += rt
828-
# currently only place `reg_pressure[3]` is updated
829-
reg_pressure[3] += rp
830-
true
831-
else
832-
(1.0, 1.0)
833-
false
834-
end
797+
@unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms
798+
if !iszero(first(isoptranslation(ls, op, unrollsyms)))
799+
rt, lat, rp = cost(ls, op, (u₁loopsym, u₂loopsym), vloopsym, Wshift, size_T)
800+
# rt = Core.ifelse(isvectorized(op), 0.5rt, rt)
801+
rto = rt
802+
rt *= iters
803+
# rt *= factor1; rp *= factor2;
804+
choose_to_inline[] = true
805+
# for loop ∈ ls.loops
806+
# # If another loop is short, assume that LLVM will unroll it, in which case
807+
# # we want to be a little more conservative in terms of register pressure.
808+
# #FIXME: heuristic hack to get some desired behavior.
809+
# if isstaticloop(loop) && length(loop) ≤ 4
810+
# itersym = loop.itersymbol
811+
# if itersym !== u₁loopsym && itersym !== u₂loopsym
812+
# return (0.25, dynamic_register_count() == 32 ? 2.0 : 1.0)
813+
# # return (0.25, 1.0)
814+
# return true
815+
# end
816+
# end
817+
# end
818+
# u₁c, u₂c = child_dependent_u₁u₂(op)
819+
# rp = max(zero(rp), rp - one(rp))
820+
# # (0.25, dynamic_register_count() == 32 ? 1.2 : 1.0)
821+
# (0.25, 1.0)
822+
# cost_vec[1] -= rt
823+
# cost_vec[1] -= 0.5625 * iters
824+
# cost_vec[1] -= 0.5625 * iters / 2
825+
# @show rto, 0.8rt, op
826+
reg_pressure[1] += 0.25rp
827+
cost_vec[2] += rt
828+
reg_pressure[2] += rp
829+
cost_vec[3] += rt
830+
# currently only place `reg_pressure[3]` is updated
831+
reg_pressure[3] += rp
832+
true
833+
else
834+
(1.0, 1.0)
835+
false
836+
end
835837
end
836838
function loadintostore(ls::LoopSet, op::Operation)
837839
isload(op) || return false # leads to bad behavior more than it helps
@@ -888,6 +890,10 @@ function add_constant_offset_load_elmination_cost!(
888890
# we treat this as the unrolled loop getting eliminated is split into 2 parts:
889891
# 1 a non-cost-reduced part, with factor udependent_reduction
890892
# 2 a cost-reduced part, with factor uindependent_increase
893+
if opisininnerloop
894+
u₁c, u₂c = child_dependent_u₁u₂(op)
895+
rp = max(zero(rp), rp - one(rp))
896+
end
891897
if uid == 1 # u₁reduces was false
892898
@assert !u₁reduces
893899
# max negative offset was in the u₁ unroll direction

src/simdfunctionals/map.jl

Lines changed: 21 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -129,6 +129,7 @@ function (m::VmapClosure{NonTemporal,F,D,N,A})(p::Ptr{UInt}) where {NonTemporal,
129129
(offset, stop ) = ThreadingUtilities.load(p, Int, offset)
130130

131131
_vmap_singlethread!(m.f, dest, start, stop, Val{NonTemporal}(), args)
132+
NonTemporal && Threads.atomic_fence()
132133
nothing
133134
end
134135

@@ -221,26 +222,27 @@ function vmap_multithread!(
221222
end
222223
nothing
223224
end
224-
@generated function gc_preserve_vmap!(f::F,
225-
y::AbstractArray,
226-
::Val{NonTemporal},
227-
::Val{Threaded},
228-
args::Vararg{AbstractArray,A}
225+
@generated function gc_preserve_vmap!(
226+
f::F,
227+
y::AbstractArray,
228+
::Val{NonTemporal},
229+
::Val{Threaded},
230+
args::Vararg{AbstractArray,A}
229231
) where {F,A,NonTemporal,Threaded}
230-
m = Threaded ? :vmap_multithread! : :vmap_singlethread!
231-
call = Expr(:call, m, :f, :y, Expr(:call, Expr(:curly, :Val, NonTemporal)))
232-
q = Expr(:block, Expr(:meta, :inline))
233-
gcpres = Expr(:gc_preserve, call)
234-
for a ∈ 1:A
235-
arg = Symbol(:arg_,a)
236-
parg = Symbol(:parg_,a)
237-
push!(q.args, Expr(:(=), arg, :(@inbounds args[$a])))#Expr(:ref, :args, a)))
238-
push!(q.args, Expr(:(=), parg, Expr(:call, :preserve_buffer, arg)))
239-
push!(call.args, arg)
240-
push!(gcpres.args, parg)
241-
end
242-
push!(q.args, gcpres, :y)
243-
q
232+
m = Threaded ? :vmap_multithread! : :vmap_singlethread!
233+
call = Expr(:call, m, :f, :y, Expr(:call, Expr(:curly, :Val, NonTemporal)))
234+
q = Expr(:block, Expr(:meta, :inline))
235+
gcpres = Expr(:gc_preserve, call)
236+
for a ∈ 1:A
237+
arg = Symbol(:arg_,a)
238+
parg = Symbol(:parg_,a)
239+
push!(q.args, Expr(:(=), arg, :(@inbounds args[$a])))#Expr(:ref, :args, a)))
240+
push!(q.args, Expr(:(=), parg, Expr(:call, :preserve_buffer, arg)))
241+
push!(call.args, arg)
242+
push!(gcpres.args, parg)
243+
end
244+
push!(q.args, gcpres, :y)
245+
q
244246
end
245247

246248

@@ -356,7 +358,6 @@ function vmapnt!(
356358
) where {F,A}
357359
if check_args(y, args...) && all_dense(y, args...)
358360
gc_preserve_vmap!(f, y, Val{true}(), Val{false}(), args...)
359-
Threads.atomic_fence()
360361
else
361362
map!(f, y, args...)
362363
end
@@ -371,7 +372,6 @@ function vmapntt!(
371372
) where {F,A}
372373
if check_args(y, args...) && all_dense(y, args...)
373374
gc_preserve_vmap!(f, y, Val{true}(), Val{true}(), args...)
374-
Threads.atomic_fence()
375375
else
376376
map!(f, y, args...)
377377
end

0 commit comments

Comments
 (0)