"""
    add_operation!(ls_new, included, ls, op, ids, issecond)

Copy `op` (and, recursively, its parents) into `ls_new` and return the copy.

`included` maps the identifier of an operation in the original loop set to the
identifier of its copy in `ls_new`; an entry of `0` means "not copied yet". If `op`
already has a non-zero entry, the existing copy is returned and nothing is added.

When `issecond` is `true`, a compute parent of a non-store `op` is not copied if one
of its children is a store whose identifier is not in `ids`. Instead, a `memload`
operation (`instruction(:getindex)`) reading from that store's `ref` is created,
registered in `included` under the parent's identifier, and used as the parent.

`ls` is not inspected here; it is only forwarded to the recursive calls.
"""
function add_operation!(
    ls_new::LoopSet, included::Vector{Int}, ls::LoopSet, op::Operation,
    ids::Vector{Int}, issecond::Bool
)
    newid = included[identifier(op)]
    iszero(newid) || return operations(ls_new)[newid]
    vparents = Operation[]
    for opp ∈ parents(op)
        # TODO: get it so that
        # a[i] = f(a[i]) will split into one loop computing and storing f(a[i]),
        # and the other loading from that storage if it needs it.
        if issecond && (iscompute(opp) & (!isstore(op)))
            found = false
            for opc ∈ children(opp)
                if isstore(opc) && identifier(opc) ∉ ids
                    # Replace `opp` with a load from the array that `opc` stores to.
                    loadop = Operation(
                        length(operations(ls_new)), name(opp), opc.elementbytes,
                        instruction(:getindex), memload,
                        loopdependencies(opc), reduceddependencies(opc), NOPARENTS,
                        opc.ref, reducedchildren(opc)
                    )
                    addsetv!(ls_new.includedactualarrays, vptr(opc.ref))
                    push!(operations(ls_new), loadop)
                    push!(vparents, loadop)
                    # Later lookups of `opp` now resolve to the replacement load.
                    included[identifier(opp)] = identifier(loadop)
                    found = true
                    break
                end
            end
            found && continue
        end
        push!(vparents, add_operation!(ls_new, included, ls, opp, ids, issecond))
    end
    # The first constructor argument is the current length of `operations(ls_new)`,
    # i.e. the position the copy takes once pushed (presumably zero-based — verify
    # against the definition of `identifier`).
    opnew = Operation(
        length(operations(ls_new)), name(op), op.elementbytes, instruction(op),
        op.node_type,
        loopdependencies(op), reduceddependencies(op), vparents, op.ref,
        reducedchildren(op)
    )
    accesses_memory(op) && addsetv!(ls_new.includedactualarrays, vptr(op.ref))
    push!(operations(ls_new), opnew)
    included[identifier(op)] = identifier(opnew)
    return opnew
end

"""
    append_if_included!(vnew, vold, included)

For every `(i, v)` pair in `vold`, look up `included[i]` and, when it is non-zero,
push `(included[i], v)` onto `vnew`. Pairs whose lookup is zero are skipped.

Mutates `vnew` in place and returns `nothing`.
"""
function append_if_included!(vnew, vold, included)
    for (i, v) ∈ vold
        id = included[i]
        # A zero entry means the element indexed by `i` has no counterpart to remap to.
        iszero(id) || push!(vnew, (id, v))
    end
    return nothing
end

"""
    split_loopset(ls, ids, issecond)

Build and return a new `LoopSet` containing the operations of `ls` selected by `ids`
(indices into `operations(ls)`), together with whatever `add_operation!` pulls in for
them.

Steps, in order:
1. Copy each selected operation with `add_operation!`, forwarding `ids` and `issecond`.
2. Add to the new loop set every loop named in the copied operations'
   `loopdependencies` that is not yet in `ls_new.loopsymbols`.
3. Copy the preamble entries and outer reductions whose operation was copied,
   remapped to the new identifiers.
4. Copy the hardware parameters and `vector_width` from `ls`.
"""
function split_loopset(ls::LoopSet, ids::Vector{Int}, issecond::Bool)
    ls_new = LoopSet(:LoopVectorization)
    # included[old_id] == new_id once an operation has been copied; 0 otherwise.
    included = zeros(Int, length(operations(ls)))
    for i ∈ ids
        add_operation!(ls_new, included, ls, operations(ls)[i], ids, issecond)
    end
    for op ∈ operations(ls_new)
        for l ∈ loopdependencies(op)
            if l ∉ ls_new.loopsymbols
                add_loop!(ls_new, getloop(ls, l))
            end
        end
        # Stop scanning once the new loop set has as many loop symbols as the original.
        length(ls_new.loopsymbols) == length(ls.loopsymbols) && break
    end
    append_if_included!(ls_new.preamble_symsym, ls.preamble_symsym, included)
    append_if_included!(ls_new.preamble_symint, ls.preamble_symint, included)
    append_if_included!(ls_new.preamble_symfloat, ls.preamble_symfloat, included)
    append_if_included!(ls_new.preamble_zeros, ls.preamble_zeros, included)
    append_if_included!(ls_new.preamble_funcofeltypes, ls.preamble_funcofeltypes, included)
    for i ∈ ls.outer_reductions
        id = included[i]
        iszero(id) || push!(ls_new.outer_reductions, id)
    end
    # TODO: allow them to differ. E.g., non-AVX2 x86 cpus don't have efficient integer calculations
    # Therefore, it would be profitable to split for this reason.
    # However, currently the default assumption in vector width will be wrong, so we should calculate
    # it correctly (like ls.vector_width); wrong (too high) value will encourage splitting when
    # it shouldn't.
    # Current behavior is incorrect when VECWIDTH chosen does actually differ between
    # split loops and the loops are statically sized, because code gen will then assume it is correct...
    l1, l2, l3 = cache_sze(ls)
    set_hw!(ls_new, reg_size(ls), reg_count(ls), cache_lnsze(ls), l1, l2, l3)
    ls_new.vector_width = ls.vector_width
    # NOTE(review): this is called on the original `ls`, not on `ls_new` — confirm
    # that is intended.
    fill_offset_memop_collection!(ls)
    return ls_new
end

79
89
function returned_ops (ls:: LoopSet )
@@ -96,14 +106,14 @@ function lower_and_split_loops(ls::LoopSet, inline::Int)
96
106
# for (ind,i) ∈ enumerate(split_candidates)
97
107
for (ind,i) ∈ enumerate (split_candidates)
98
108
split_1[1 ] = i
99
- ls_1 = split_loopset (ls, split_1)
109
+ ls_1 = split_loopset (ls, split_1, false )
100
110
order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, cost_1, shouldinline_1 = choose_order_cost (ls_1)
101
111
remaining_ops[1 : ind- 1 ] .= @view (split_candidates[1 : ind- 1 ]); remaining_ops[ind: end ] .= @view (split_candidates[ind+ 1 : end ])
102
- ls_2 = split_loopset (ls, remaining_ops)
112
+ ls_2 = split_loopset (ls, remaining_ops, true )
103
113
order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2, shouldinline_2 = choose_order_cost (ls_2)
104
114
# U_1 = T_1 = U_2 = T_2 = 2
105
115
# return ls_1, ls_2
106
- # @show cost_1 + cost_2 ≤ cost_fused, cost_1, cost_2, cost_fused
116
+ # @show cost_1 + cost_2 ≤ 0.9cost_fused, (cost_1 + cost_2) / cost_fused, cost_1, cost_2, cost_fused
107
117
if cost_1 + cost_2 ≤ 0.9 cost_fused
108
118
ls_2_lowered = if length (remaining_ops) > 1
109
119
inline = iszero (inline) ? (shouldinline_1 % Int) : inline
0 commit comments