Skip to content

Commit 45d281e

Browse files
committed
Allow loading from stores for splitting
1 parent 8a3c3eb commit 45d281e

File tree

3 files changed

+98
-72
lines changed

3 files changed

+98
-72
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.12.56"
4+
version = "0.12.57"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"

src/codegen/split_loops.jl

Lines changed: 80 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,79 +1,89 @@
11

22

3-
function add_operation!(ls_new::LoopSet, included::Vector{Int}, ls::LoopSet, op::Operation)
4-
newid = included[identifier(op)]
5-
iszero(newid) || return operations(ls_new)[newid]
6-
vparents = Operation[]
7-
for opp ∈ parents(op)
8-
# TODO: get it so that
9-
# a[i] = f(a[i]) will split into one loop computing and storing f(a[i]), and the other loading from that storage if it needs it.
10-
# if iscompute(opp) && (!isstore(op)) # search for stores
11-
# found = false
12-
# for oppp ∈ operations(ls)
13-
# isstore(oppp) || continue
14-
# if first(parents(oppp)) === op
15-
# found = true
16-
17-
# push!(vparents, add_operation!(ls_new, included, ls, opppp))
18-
# break
19-
# end
20-
# end
21-
# found && continue
22-
# end
23-
push!(vparents, add_operation!(ls_new, included, ls, opp))
3+
function add_operation!(ls_new::LoopSet, included::Vector{Int}, ls::LoopSet, op::Operation, ids::Vector{Int}, issecond::Bool)
4+
newid = included[identifier(op)]
5+
iszero(newid) || return operations(ls_new)[newid]
6+
vparents = Operation[]
7+
for opp ∈ parents(op)
8+
# TODO: get it so that
9+
# a[i] = f(a[i]) will split into one loop computing and storing f(a[i]), and the other loading from that storage if it needs it.
10+
if issecond && (iscompute(opp) & (!isstore(op)))
11+
found = false
12+
for opc ∈ children(opp)
13+
if isstore(opc) && identifier(opc) ∉ ids
14+
# @show opp opc op
15+
# replace opp with a load from opc
16+
opnew = Operation(
17+
length(operations(ls_new)), name(opp), opc.elementbytes, instruction(:getindex), memload,
18+
loopdependencies(opc), reduceddependencies(opc), NOPARENTS, opc.ref, reducedchildren(opc)
19+
)
20+
addsetv!(ls_new.includedactualarrays, vptr(opc.ref))
21+
push!(operations(ls_new), opnew)
22+
push!(vparents, opnew)
23+
included[identifier(opp)] = identifier(opnew)
24+
found = true
25+
break
26+
end
27+
end
28+
found && continue
2429
end
25-
opnew = Operation(
26-
length(operations(ls_new)), name(op), op.elementbytes, instruction(op), op.node_type,
27-
loopdependencies(op), reduceddependencies(op), vparents, op.ref, reducedchildren(op)
28-
)
29-
accesses_memory(op) && addsetv!(ls_new.includedactualarrays, vptr(op.ref))
30-
push!(operations(ls_new), opnew)
31-
included[identifier(op)] = identifier(opnew)
32-
opnew
30+
push!(vparents, add_operation!(ls_new, included, ls, opp, ids, issecond))
31+
end
32+
opnew = Operation(
33+
length(operations(ls_new)), name(op), op.elementbytes, instruction(op), op.node_type,
34+
loopdependencies(op), reduceddependencies(op), vparents, op.ref, reducedchildren(op)
35+
)
36+
accesses_memory(op) && addsetv!(ls_new.includedactualarrays, vptr(op.ref))
37+
push!(operations(ls_new), opnew)
38+
included[identifier(op)] = identifier(opnew)
39+
opnew
3340
end
3441

3542
function append_if_included!(vnew, vold, included)
36-
for (i, v) ∈ vold
37-
id = included[i]
38-
iszero(id) || push!(vnew, (id, v))
39-
end
43+
for (i, v) ∈ vold
44+
id = included[i]
45+
iszero(id) || push!(vnew, (id, v))
46+
end
4047
end
4148

42-
function split_loopset(ls::LoopSet, ids)
43-
ls_new = LoopSet(:LoopVectorization)
44-
included = zeros(Int, length(operations(ls)))
45-
for i ∈ ids
46-
add_operation!(ls_new, included, ls, operations(ls)[i])
47-
end
48-
for op ∈ operations(ls_new)
49-
for l ∈ loopdependencies(op)
50-
if l ∉ ls_new.loopsymbols
51-
add_loop!(ls_new, getloop(ls, l))
52-
end
53-
end
54-
length(ls_new.loopsymbols) == length(ls.loopsymbols) && break
55-
end
56-
append_if_included!(ls_new.preamble_symsym, ls.preamble_symsym, included)
57-
append_if_included!(ls_new.preamble_symint, ls.preamble_symint, included)
58-
append_if_included!(ls_new.preamble_symfloat, ls.preamble_symfloat, included)
59-
append_if_included!(ls_new.preamble_zeros, ls.preamble_zeros, included)
60-
append_if_included!(ls_new.preamble_funcofeltypes, ls.preamble_funcofeltypes, included)
61-
for i ∈ ls.outer_reductions
62-
id = included[i]
63-
iszero(id) || push!(ls_new.outer_reductions, id)
49+
function split_loopset(ls::LoopSet, ids::Vector{Int}, issecond::Bool)
50+
ls_new = LoopSet(:LoopVectorization)
51+
included = zeros(Int, length(operations(ls)))
52+
for i ∈ ids
53+
add_operation!(ls_new, included, ls, operations(ls)[i], ids, issecond)
54+
end
55+
for op ∈ operations(ls_new)
56+
for l ∈ loopdependencies(op)
57+
if l ∉ ls_new.loopsymbols
58+
add_loop!(ls_new, getloop(ls, l))
59+
end
6460
end
65-
# TODO: allow them to differ. E.g., non-AVX2 x86 cpus don't have efficient integer calculations
66-
# Therefore, it would be profitable to split for this reason.
67-
# However, currently the default assumption in vector width will be wrong, so we should calculate
68-
# it correctly (like ls.vector_width); wrong (too high) value will encourage splitting when
69-
# it shouldn't.
70-
# Current behavior is incorrect when VECWIDTH chosen does actually differ between
71-
# split loops and the loops are statically sized, because code gen will then assume it is correct...
72-
l1, l2, l3 = cache_sze(ls)
73-
set_hw!(ls_new, reg_size(ls), reg_count(ls), cache_lnsze(ls), l1, l2, l3)
74-
ls_new.vector_width = ls.vector_width
75-
fill_offset_memop_collection!(ls)
76-
ls_new
61+
length(ls_new.loopsymbols) == length(ls.loopsymbols) && break
62+
end
63+
append_if_included!(ls_new.preamble_symsym, ls.preamble_symsym, included)
64+
append_if_included!(ls_new.preamble_symint, ls.preamble_symint, included)
65+
append_if_included!(ls_new.preamble_symfloat, ls.preamble_symfloat, included)
66+
append_if_included!(ls_new.preamble_zeros, ls.preamble_zeros, included)
67+
append_if_included!(ls_new.preamble_funcofeltypes, ls.preamble_funcofeltypes, included)
68+
for i ∈ ls.outer_reductions
69+
id = included[i]
70+
iszero(id) || push!(ls_new.outer_reductions, id)
71+
end
72+
# TODO: allow them to differ. E.g., non-AVX2 x86 cpus don't have efficient integer calculations
73+
# Therefore, it would be profitable to split for this reason.
74+
# However, currently the default assumption in vector width will be wrong, so we should calculate
75+
# it correctly (like ls.vector_width); wrong (too high) value will encourage splitting when
76+
# it shouldn't.
77+
# Current behavior is incorrect when VECWIDTH chosen does actually differ between
78+
# split loops and the loops are statically sized, because code gen will then assume it is correct...
79+
l1, l2, l3 = cache_sze(ls)
80+
set_hw!(ls_new, reg_size(ls), reg_count(ls), cache_lnsze(ls), l1, l2, l3)
81+
ls_new.vector_width = ls.vector_width
82+
fill_offset_memop_collection!(ls)
83+
# println("ls_new operations:")
84+
# display(ls_new.operations)
85+
# println()
86+
ls_new
7787
end
7888

7989
function returned_ops(ls::LoopSet)
@@ -96,14 +106,14 @@ function lower_and_split_loops(ls::LoopSet, inline::Int)
96106
# for (ind,i) ∈ enumerate(split_candidates)
97107
for (ind,i) ∈ enumerate(split_candidates)
98108
split_1[1] = i
99-
ls_1 = split_loopset(ls, split_1)
109+
ls_1 = split_loopset(ls, split_1, false)
100110
order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, cost_1, shouldinline_1 = choose_order_cost(ls_1)
101111
remaining_ops[1:ind-1] .= @view(split_candidates[1:ind-1]); remaining_ops[ind:end] .= @view(split_candidates[ind+1:end])
102-
ls_2 = split_loopset(ls, remaining_ops)
112+
ls_2 = split_loopset(ls, remaining_ops, true)
103113
order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2, shouldinline_2 = choose_order_cost(ls_2)
104114
# U_1 = T_1 = U_2 = T_2 = 2
105115
# return ls_1, ls_2
106-
# @show cost_1 + cost_2 ≤ cost_fused, cost_1, cost_2, cost_fused
116+
# @show cost_1 + cost_2 ≤ 0.9cost_fused, (cost_1 + cost_2) / cost_fused, cost_1, cost_2, cost_fused
107117
if cost_1 + cost_2 ≤ 0.9cost_fused
108118
ls_2_lowered = if length(remaining_ops) > 1
109119
inline = iszero(inline) ? (shouldinline_1 % Int) : inline

test/special.jl

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,20 @@
344344
end
345345
end
346346
end
347+
function sin_sum_3loop_split!(u, x, y, z)
348+
sx = similar(x); sy = similar(y); sz = similar(z);
349+
@turbo for k in 1:length(z)
350+
for j in 1:length(y)
351+
for i in 1:length(x)
352+
sxi = sin(x[i])
353+
syj = sin(y[j])
354+
szk = sin(z[k])
355+
sx[i] = sxi; sy[j] = syj; sz[k] = szk;
356+
u[i, j, k] = sxi + syj + szk
357+
end
358+
end
359+
end
360+
end
347361

348362
for T ∈ (Float32, Float64)
349363
@show T, @__LINE__
@@ -428,6 +442,8 @@
428442
u = zeros(itot+8, itot+8, itot+8);
429443
uv = @view u[5:5+itot-1, 5:5+itot-1, 5:5+itot-1];
430444
sin_sum_3loop!(uv, x, y, z);
431-
@test uv ≈ (identity(sin.(x)) .+ identity((sin.(y))')) .+ identity(reshape(sin.(z), (1, 1, length(z))))
445+
uv2 = @view similar(u)[5:5+itot-1, 5:5+itot-1, 5:5+itot-1];
446+
sin_sum_3loop_split!(uv2, x, y, z);
447+
@test uv ≈ uv2 ≈ (identity(sin.(x)) .+ identity((sin.(y))')) .+ identity(reshape(sin.(z), (1, 1, length(z))))
432448
end
433449
end

0 commit comments

Comments
 (0)