Skip to content

Commit 7473acc

Browse files
committed
Some progress/updates.
1 parent b2ec589 commit 7473acc

File tree

8 files changed

+64
-15
lines changed

8 files changed

+64
-15
lines changed

Project.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.12"
4+
version = "0.12.0"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -11,6 +11,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
1111
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
1212
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
1313
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
14+
Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
1415
ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5"
1516
UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
1617
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
@@ -22,6 +23,7 @@ IfElse = "0.1"
2223
OffsetArrays = "1.4.1, 1.5"
2324
Requires = "1"
2425
SLEEFPirates = "0.6.7"
26+
Static = "0.2"
2527
ThreadingUtilities = "0.2.3"
2628
UnPack = "1"
2729
VectorizationBase = "0.19"

src/LoopVectorization.jl

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module LoopVectorization
22

3-
3+
using Static: StaticInt, gt
44
using VectorizationBase, SLEEFPirates, UnPack, OffsetArrays
55
using VectorizationBase: register_size, register_count, cache_linesize, has_opmask_registers,
66
mask, pick_vector_width, MM, AbstractMask, data, grouped_strided_pointer,
@@ -18,7 +18,8 @@ using VectorizationBase: register_size, register_count, cache_linesize, has_opma
1818
contract_max, collapse_max,
1919
contract_min, collapse_min,
2020
contract_and, collapse_and,
21-
contract_or, collapse_or
21+
contract_or, collapse_or,
22+
num_threads, num_cores
2223

2324

2425
using IfElse: ifelse
@@ -76,6 +77,7 @@ include("codegen/lower_load.jl")
7677
include("codegen/lower_store.jl")
7778
include("codegen/lowering.jl")
7879
include("codegen/split_loops.jl")
80+
include("codegen/lower_threads.jl")
7981
include("condense_loopset.jl")
8082
include("reconstruct_loopset.jl")
8183
include("constructors.jl")

src/codegen/loopstartstopmanager.jl

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -437,13 +437,12 @@ function offset_ptr(
437437
end
438438
Expr(:(=), vptr(ar), Expr(:call, lv(:gesp), vptr(ar), gespinds))
439439
end
440-
function incrementloopcounter(ls::LoopSet, us::UnrollSpecification, n::Int, UF::Int)
440+
function incrementloopcounter!(q::Expr, ls::LoopSet, us::UnrollSpecification, n::Int, UF::Int)
441441
@unpack u₁loopnum, u₂loopnum, vloopnum, u₁, u₂ = us
442442
lssm = ls.lssm[]
443443
ptrdefs = lssm.incrementedptrs[n]
444444
looporder = names(ls)
445445
loopsym = looporder[n]
446-
q = Expr(:block)
447446
termind = lssm.terminators[n]
448447
loop = getloop(ls, n)
449448
if iszero(termind) # increment liv
@@ -453,7 +452,7 @@ function incrementloopcounter(ls::LoopSet, us::UnrollSpecification, n::Int, UF::
453452
offsetinds = indices_calculated_by_pointer_offsets(ls, ar)
454453
push!(q.args, offset_ptr(ar, us, loopsym, n, UF, offsetinds, loop))
455454
end
456-
q
455+
nothing
457456
end
458457
function terminatecondition(ls::LoopSet, us::UnrollSpecification, n::Int, inclmask::Bool, UF::Int)
459458
lssm = ls.lssm[]

src/codegen/lower_compute.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,12 +106,12 @@ function ifelselastexpr(hasf::Bool, M::Int, vargtypes, K::Int, S::Int, maskearly
106106
for k 1:K
107107
lengths[k] = l = vecunrolllen(vargtypes[k])
108108
if hasf
109-
if l == -1
110-
push!(q.args, :($(vargs[k]) = getfield(vargs, $k, false)))
111-
else
112-
push!(q.args, :($(vargs[k]) = data(getfield(vargs, $k, false))))
109+
gfvarg = Expr(:call, GlobalRef(Core, :getfield), :vargs, k, false)
110+
if l -1 # VecUnroll
111+
gfvarg = Expr(:call, GlobalRef(Core, :getfield), gfvarg, 1, false)
113112
end
114-
elseif l != -1
113+
push!(q.args, Expr(:(=), vargs[k], gfvarg))
114+
elseif l -1
115115
varg = vargs[k]
116116
vargs[k] = dvarg = Symbol(:d, varg)
117117
push!(q.args, :($dvarg = data($varg)))

src/codegen/lower_threads.jl

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,58 @@ struct AVX{UNROLL,OPS,ARF,AM,LPSYM,LB,V} <: Function end
33

44
# This should call the same `_avx_!(Val{UNROLL}(), Val{OPS}(), Val{ARF}(), Val{AM}(), Val{LPSYM}(), _vargs)` as normal so that this
55
# hopefully shouldn't add much to compile time.
6+
67
function (::AVX{UNROLL,OPS,ARF,AM,LPSYM,LB,V})(p::Ptr{UInt}) where {UNROLL,OPS,ARF,AM,LPSYM,LB,V}
    # Read the `Tuple{LB,V}` argument bundle from the memory behind `p`.
    # NOTE(review): the positions `1` (load) and `7` (store) are taken as-is;
    # their layout is defined by ThreadingUtilities — confirm against its docs.
    packed = ThreadingUtilities.load(p, Tuple{LB,V}, 1)
    # Dispatch to the ordinary `_avx_!` entry point, with the same `Val`
    # arguments as a non-threaded call.
    result = _avx_!(
        Val{UNROLL}(), Val{OPS}(), Val{ARF}(), Val{AM}(), Val{LPSYM}(), packed
    )
    # Write the result back through `p`.
    ThreadingUtilities.store!(p, result, 7)
    return nothing
end
1213

14+
# function approx_cbrt(x)
15+
# s = significand(x)
16+
# e = exponent(x)
17+
18+
# # 40 + 0.00020833333333333335*(x-64000) -2.1701388888888896e-9*(x-64000)^2*0.5 + 5.6514033564814844e-14 * (x-64000)^3/6
19+
# end
20+
21+
function choose_threads(::StaticInt{C}, x) where {C}
    # Upper bound on the answer: the smaller of the thread and core counts.
    threads = num_threads()
    cores = num_cores()
    limit = ifelse(gt(threads, cores), cores, threads)
    # `x` is reinterpreted as an unsigned integer and converted to Float64.
    # NOTE(review): presumably `x` is a work-size estimate and `C` a static
    # cost factor; the scaling constant is empirical — confirm with the author.
    root = Base.sqrt_llvm(Base.uitofp(Float64, x))
    estimate = Base.ceil_llvm(5.0852672001495816e-11 * C * root)
    return min(Base.fptosi(Int, estimate), limit)
end
26+
27+
function thread_single_loop_expr(ls::LoopSet, UNROLL, id)
    # Stub: no expression is generated yet, so the call evaluates to `nothing`.
    return nothing
end
30+
function thread_multiple_loop_expr(ls::LoopSet, UNROLL, valid_thread_loop)
    # Stub: no expression is generated yet, so the call evaluates to `nothing`.
    return nothing
end
1433

15-
function _avx_threads!()
34+
function avx_threads_expr(ls::LoopSet, UNROLL)
    # Only the loop order (first element of the returned tuple) is needed here.
    order = first(choose_order_cost(ls))
    # valid_thread_loop[i] stays `true` while loop `order[i]` may be threaded.
    valid_thread_loop = fill(true, length(order))
    for op in operations(ls)
        isstore(op) || continue
        # A loop listed among a store's reduced dependencies is disqualified
        # (presumably because several threads would then write to the same
        # destination — confirm).
        for reduceddep in reduceddependencies(op)
            for (i, o) in enumerate(order)
                if o === reduceddep
                    valid_thread_loop[i] = false
                end
            end
        end
    end
    num_candidates = count(valid_thread_loop)
    # TODO: cap the number of threaded loops (e.g. `min(num_candidates, 2)`)
    # and collect the candidate ids.
    if num_candidates == 0
        # Nothing can be threaded: fall back to the ordinary body.
        return avx_body(ls, UNROLL)
    elseif num_candidates == 1
        return thread_single_loop_expr(ls, UNROLL, findfirst(valid_thread_loop)::Int)
    else
        # The original passed the undefined name `vald_thread_loop` here.
        return thread_multiple_loop_expr(ls, UNROLL, valid_thread_loop)
    end
end
1860

src/codegen/lowering.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ function lower_block(
8282
end
8383
loopsym = order[n]
8484
# if n > 1 || iszero(ls.align_loops[])
85-
push!(blockq.args, incrementloopcounter(ls, us, n, UF))
85+
incrementloopcounter!(blockq, ls, us, n, UF)
8686
# else
8787
# loopsym = names(ls)[n]
8888
# push!(blockq.args, Expr(:(=), loopsym, Expr(:call, lv(:vadd_fast), loopsym, Symbol("##ALIGNMENT#STEP##"))))

src/modeling/graphs.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -607,7 +607,11 @@ Base.length(ls::LoopSet, s::Symbol) = length(getloop(ls, s))
607607
function init_loop_map!(ls::LoopSet)
    @unpack loopordermap = ls
    loopnames = names(ls)
    nloops = length(loopnames)
    # Gather each loop's id with a plain loop so that the sort below can work
    # on a vector of integers and needs no `by` closure.
    loopids = Vector{Int}(undef, nloops)
    for (slot, loopname) in enumerate(loopnames)
        loopids[slot] = getloopid(ls, loopname)
    end
    # Store in `loopordermap` the permutation that sorts the loops by id.
    resize!(loopordermap, nloops)
    sortperm!(loopordermap, loopids)
    return nothing
end
613617

src/reconstruct_loopset.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -570,7 +570,7 @@ Execute an `@avx` block. The block's code is represented via the arguments:
570570
- `vargs...` holds the encoded pointers of all the arrays (see `VectorizationBase`'s various pointer types).
571571
"""
572572
@generated function _avx_!(::Val{UNROLL}, ::Val{OPS}, ::Val{ARF}, ::Val{AM}, ::Val{LPSYM}, _vargs::Tuple{LB,V}) where {UNROLL, OPS, ARF, AM, LPSYM, LB, V}
573-
1 + 1 # Irrelevant line you can comment out/in to force recompilation...
573+
# 1 + 1 # Irrelevant line you can comment out/in to force recompilation...
574574
ls = _avx_loopset(OPS, ARF, AM, LPSYM, LB.parameters, V.parameters)
575575
# return @show avx_body(ls, UNROLL)
576576
# @show UNROLL, OPS, ARF, AM, LPSYM, LB

0 commit comments

Comments
 (0)