Skip to content

Commit d1b9f0b

Browse files
committed
Merge branch 'master' of github.com:JuliaSIMD/LoopVectorization.jl
2 parents c89b1c9 + c95a010 commit d1b9f0b

File tree

4 files changed

+107
-76
lines changed

4 files changed

+107
-76
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.12.56"
4+
version = "0.12.57"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"

src/broadcast.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -404,8 +404,8 @@ end
404404
loopsyms = [gensym!(ls, "n") for n ∈ 1:N]
405405
add_broadcast_loops!(ls, loopsyms, :dest)
406406
elementbytes = sizeof(T)
407-
add_broadcast!(ls, :dest, :bc, loopsyms, BC, elementbytes)
408-
storeop = add_simple_store!(ls, :dest, ArrayReference(:dest, loopsyms), elementbytes)
407+
add_broadcast!(ls, :destination, :bc, loopsyms, BC, elementbytes)
408+
storeop = add_simple_store!(ls, :destination, ArrayReference(:dest, loopsyms), elementbytes)
409409
doaddref!(ls, storeop)
410410
resize!(ls.loop_order, num_loops(ls)) # num_loops may be greater than N, eg Product
411411
# return ls
@@ -426,8 +426,8 @@ end
426426
pushprepreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′)))
427427
add_broadcast_loops!(ls, loopsyms, :dest′)
428428
elementbytes = sizeof(T)
429-
add_broadcast!(ls, :dest, :bc, loopsyms, BC, elementbytes)
430-
storeop = add_simple_store!(ls, :dest, ArrayReference(:dest, reverse(loopsyms)), elementbytes)
429+
add_broadcast!(ls, :destination, :bc, loopsyms, BC, elementbytes)
430+
storeop = add_simple_store!(ls, :destination, ArrayReference(:dest, reverse(loopsyms)), elementbytes)
431431
doaddref!(ls, storeop)
432432
resize!(ls.loop_order, num_loops(ls)) # num_loops may be greater than N, eg Product
433433
Expr(:block, Expr(:meta,:inline), setup_call(ls, :(Base.Broadcast.materialize!(dest′, bc)), LineNumberNode(0), inline, false, u₁, u₂, v, threads%Int, warncheckarg), :dest′)

src/codegen/split_loops.jl

Lines changed: 85 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,79 +1,94 @@
11

22

3-
function add_operation!(ls_new::LoopSet, included::Vector{Int}, ls::LoopSet, op::Operation)
4-
newid = included[identifier(op)]
5-
iszero(newid) || return operations(ls_new)[newid]
6-
vparents = Operation[]
7-
for opp ∈ parents(op)
8-
# TODO: get it so that
9-
# a[i] = f(a[i]) will split into one loop computing and storing f(a[i]), and the other loading from that storage if it needs it.
10-
# if iscompute(opp) && (!isstore(op)) # search for stores
11-
# found = false
12-
# for oppp ∈ operations(ls)
13-
# isstore(oppp) || continue
14-
# if first(parents(oppp)) === op
15-
# found = true
16-
17-
# push!(vparents, add_operation!(ls_new, included, ls, opppp))
18-
# break
19-
# end
20-
# end
21-
# found && continue
22-
# end
23-
push!(vparents, add_operation!(ls_new, included, ls, opp))
3+
function add_operation!(ls_new::LoopSet, included::Vector{Int}, ls::LoopSet, op::Operation, ids::Vector{Int}, issecond::Bool)
4+
newid = included[identifier(op)]
5+
iszero(newid) || return operations(ls_new)[newid]
6+
vparents = Operation[]
7+
for opp ∈ parents(op)
8+
# TODO: get it so that
9+
# a[i] = f(a[i]) will split into one loop computing and storing f(a[i]), and the other loading from that storage if it needs it.
10+
if issecond && (iscompute(opp) & (!isstore(op)))
11+
found = false
12+
for opc ∈ children(opp)
13+
if isstore(opc) && identifier(opc) ∉ ids
14+
# @show opp opc op
15+
# replace opp with a load from opc
16+
parentsopc = parents(opc)
17+
parentsnew = length(parentsopc) > 1 ? Operation[] : NOPARENTS
18+
opnew = Operation(
19+
length(operations(ls_new)), name(opp), opc.elementbytes, instruction(:getindex), memload,
20+
loopdependencies(opc), reduceddependencies(opc), parentsnew, opc.ref, reducedchildren(opc)
21+
)
22+
addsetv!(ls_new.includedactualarrays, vptr(opc.ref))
23+
push!(operations(ls_new), opnew)
24+
push!(vparents, opnew)
25+
for i ∈ 2:length(parentsopc)
26+
push!(parentsnew, add_operation!(ls_new, included, ls, parentsopc[i], ids, issecond))
27+
end
28+
included[identifier(opp)] = identifier(opnew)
29+
found = true
30+
break
31+
end
32+
end
33+
found && continue
2434
end
25-
opnew = Operation(
26-
length(operations(ls_new)), name(op), op.elementbytes, instruction(op), op.node_type,
27-
loopdependencies(op), reduceddependencies(op), vparents, op.ref, reducedchildren(op)
28-
)
29-
accesses_memory(op) && addsetv!(ls_new.includedactualarrays, vptr(op.ref))
30-
push!(operations(ls_new), opnew)
31-
included[identifier(op)] = identifier(opnew)
32-
opnew
35+
push!(vparents, add_operation!(ls_new, included, ls, opp, ids, issecond))
36+
end
37+
opnew = Operation(
38+
length(operations(ls_new)), name(op), op.elementbytes, instruction(op), op.node_type,
39+
loopdependencies(op), reduceddependencies(op), vparents, op.ref, reducedchildren(op)
40+
)
41+
accesses_memory(op) && addsetv!(ls_new.includedactualarrays, vptr(op.ref))
42+
push!(operations(ls_new), opnew)
43+
included[identifier(op)] = identifier(opnew)
44+
opnew
3345
end
3446

3547
function append_if_included!(vnew, vold, included)
36-
for (i, v) ∈ vold
37-
id = included[i]
38-
iszero(id) || push!(vnew, (id, v))
39-
end
48+
for (i, v) ∈ vold
49+
id = included[i]
50+
iszero(id) || push!(vnew, (id, v))
51+
end
4052
end
4153

42-
function split_loopset(ls::LoopSet, ids)
43-
ls_new = LoopSet(:LoopVectorization)
44-
included = zeros(Int, length(operations(ls)))
45-
for i ∈ ids
46-
add_operation!(ls_new, included, ls, operations(ls)[i])
47-
end
48-
for op ∈ operations(ls_new)
49-
for l ∈ loopdependencies(op)
50-
if l ∉ ls_new.loopsymbols
51-
add_loop!(ls_new, getloop(ls, l))
52-
end
53-
end
54-
length(ls_new.loopsymbols) == length(ls.loopsymbols) && break
55-
end
56-
append_if_included!(ls_new.preamble_symsym, ls.preamble_symsym, included)
57-
append_if_included!(ls_new.preamble_symint, ls.preamble_symint, included)
58-
append_if_included!(ls_new.preamble_symfloat, ls.preamble_symfloat, included)
59-
append_if_included!(ls_new.preamble_zeros, ls.preamble_zeros, included)
60-
append_if_included!(ls_new.preamble_funcofeltypes, ls.preamble_funcofeltypes, included)
61-
for i ∈ ls.outer_reductions
62-
id = included[i]
63-
iszero(id) || push!(ls_new.outer_reductions, id)
54+
function split_loopset(ls::LoopSet, ids::Vector{Int}, issecond::Bool)
55+
ls_new = LoopSet(:LoopVectorization)
56+
included = zeros(Int, length(operations(ls)))
57+
for i ∈ ids
58+
add_operation!(ls_new, included, ls, operations(ls)[i], ids, issecond)
59+
end
60+
for op ∈ operations(ls_new)
61+
for l ∈ loopdependencies(op)
62+
if l ∉ ls_new.loopsymbols
63+
add_loop!(ls_new, getloop(ls, l))
64+
end
6465
end
65-
# TODO: allow them to differ. E.g., non-AVX2 x86 cpus don't have efficient integer calculations
66-
# Therefore, it would be profitable to split for this reason.
67-
# However, currently the default assumption in vector width will be wrong, so we should calculate
68-
# it correctly (like ls.vector_width); wrong (too high) value will encourage splitting when
69-
# it shouldn't.
70-
# Current behavior is incorrect when VECWIDTH chosen does actually differ between
71-
# split loops and the loops are statically sized, because code gen will then assume it is correct...
72-
l1, l2, l3 = cache_sze(ls)
73-
set_hw!(ls_new, reg_size(ls), reg_count(ls), cache_lnsze(ls), l1, l2, l3)
74-
ls_new.vector_width = ls.vector_width
75-
fill_offset_memop_collection!(ls)
76-
ls_new
66+
length(ls_new.loopsymbols) == length(ls.loopsymbols) && break
67+
end
68+
append_if_included!(ls_new.preamble_symsym, ls.preamble_symsym, included)
69+
append_if_included!(ls_new.preamble_symint, ls.preamble_symint, included)
70+
append_if_included!(ls_new.preamble_symfloat, ls.preamble_symfloat, included)
71+
append_if_included!(ls_new.preamble_zeros, ls.preamble_zeros, included)
72+
append_if_included!(ls_new.preamble_funcofeltypes, ls.preamble_funcofeltypes, included)
73+
for i ∈ ls.outer_reductions
74+
id = included[i]
75+
iszero(id) || push!(ls_new.outer_reductions, id)
76+
end
77+
# TODO: allow them to differ. E.g., non-AVX2 x86 cpus don't have efficient integer calculations
78+
# Therefore, it would be profitable to split for this reason.
79+
# However, currently the default assumption in vector width will be wrong, so we should calculate
80+
# it correctly (like ls.vector_width); wrong (too high) value will encourage splitting when
81+
# it shouldn't.
82+
# Current behavior is incorrect when VECWIDTH chosen does actually differ between
83+
# split loops and the loops are statically sized, because code gen will then assume it is correct...
84+
l1, l2, l3 = cache_sze(ls)
85+
set_hw!(ls_new, reg_size(ls), reg_count(ls), cache_lnsze(ls), l1, l2, l3)
86+
ls_new.vector_width = ls.vector_width
87+
fill_offset_memop_collection!(ls)
88+
# println("ls_new operations:")
89+
# display(ls_new.operations)
90+
# println()
91+
ls_new
7792
end
7893

7994
function returned_ops(ls::LoopSet)
@@ -96,14 +111,14 @@ function lower_and_split_loops(ls::LoopSet, inline::Int)
96111
# for (ind,i) ∈ enumerate(split_candidates)
97112
for (ind,i) ∈ enumerate(split_candidates)
98113
split_1[1] = i
99-
ls_1 = split_loopset(ls, split_1)
114+
ls_1 = split_loopset(ls, split_1, false)
100115
order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, cost_1, shouldinline_1 = choose_order_cost(ls_1)
101116
remaining_ops[1:ind-1] .= @view(split_candidates[1:ind-1]); remaining_ops[ind:end] .= @view(split_candidates[ind+1:end])
102-
ls_2 = split_loopset(ls, remaining_ops)
117+
ls_2 = split_loopset(ls, remaining_ops, true)
103118
order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2, shouldinline_2 = choose_order_cost(ls_2)
104119
# U_1 = T_1 = U_2 = T_2 = 2
105120
# return ls_1, ls_2
106-
# @show cost_1 + cost_2 ≤ cost_fused, cost_1, cost_2, cost_fused
121+
# @show cost_1 + cost_2 ≤ 0.9cost_fused, (cost_1 + cost_2) / cost_fused, cost_1, cost_2, cost_fused
107122
if cost_1 + cost_2 ≤ 0.9cost_fused
108123
ls_2_lowered = if length(remaining_ops) > 1
109124
inline = iszero(inline) ? (shouldinline_1 % Int) : inline

test/special.jl

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,20 @@
344344
end
345345
end
346346
end
347+
function sin_sum_3loop_split!(u, x, y, z)
348+
sx = similar(x); sy = similar(y); sz = similar(z);
349+
@turbo for k in 1:length(z)
350+
for j in 1:length(y)
351+
for i in 1:length(x)
352+
sxi = sin(x[i])
353+
syj = sin(y[j])
354+
szk = sin(z[k])
355+
sx[i] = sxi; sy[j] = syj; sz[k] = szk;
356+
u[i, j, k] = sxi + syj + szk
357+
end
358+
end
359+
end
360+
end
347361

348362
for T ∈ (Float32, Float64)
349363
@show T, @__LINE__
@@ -428,6 +442,8 @@
428442
u = zeros(itot+8, itot+8, itot+8);
429443
uv = @view u[5:5+itot-1, 5:5+itot-1, 5:5+itot-1];
430444
sin_sum_3loop!(uv, x, y, z);
431-
@test uv ≈ (identity(sin.(x)) .+ identity((sin.(y))')) .+ identity(reshape(sin.(z), (1, 1, length(z))))
445+
uv2 = @view similar(u)[5:5+itot-1, 5:5+itot-1, 5:5+itot-1];
446+
sin_sum_3loop_split!(uv2, x, y, z);
447+
@test uv ≈ uv2 ≈ (identity(sin.(x)) .+ identity((sin.(y))')) .+ identity(reshape(sin.(z), (1, 1, length(z))))
432448
end
433449
end

0 commit comments

Comments
 (0)