1
1
2
2
3
"""
    add_operation!(ls_new, included, ls, op, ids, issecond)

Copy `op` from `ls` into `ls_new`, recursively copying its parents first, and return
the operation that now represents `op` inside `ls_new`.

# Arguments
- `ls_new::LoopSet`: loop set being built; its operation list and
  `includedactualarrays` are mutated.
- `included::Vector{Int}`: indexed by `identifier(x)` for operations `x` of `ls`; a zero
  entry means "not copied yet", a non-zero entry is used to index `operations(ls_new)`.
  Updated in place.
- `ls::LoopSet`: the loop set being split (only forwarded to the recursive calls here).
- `op::Operation`: the operation to copy.
- `ids::Vector{Int}`: identifiers of the operations selected for this half of the split.
- `issecond::Bool`: when `true`, a compute parent whose result is stored by an operation
  that is *not* in `ids` is replaced by a load from that store instead of being copied.
"""
function add_operation!(
    ls_new::LoopSet,
    included::Vector{Int},
    ls::LoopSet,
    op::Operation,
    ids::Vector{Int},
    issecond::Bool,
)
    newid = included[identifier(op)]
    # Already copied: hand back the existing operation rather than duplicating it.
    iszero(newid) || return operations(ls_new)[newid]
    vparents = Operation[]
    for opp ∈ parents(op)
        # TODO: get it so that
        # a[i] = f(a[i]) will split into one loop computing and storing f(a[i]),
        # and the other loading from that storage if it needs it.
        if issecond && (iscompute(opp) & (!isstore(op)))
            found = false
            for opc ∈ children(opp)
                if isstore(opc) && identifier(opc) ∉ ids
                    # Replace `opp` with a load from the location `opc` stores to.
                    parentsopc = parents(opc)
                    # The load inherits every parent of the store except the first.
                    # NOTE(review): presumably the first parent is the stored value and
                    # the rest are indexing parents; confirm against `Operation`.
                    parentsnew = length(parentsopc) > 1 ? Operation[] : NOPARENTS
                    opnew = Operation(
                        length(operations(ls_new)),
                        name(opp),
                        opc.elementbytes,
                        instruction(:getindex),
                        memload,
                        loopdependencies(opc),
                        reduceddependencies(opc),
                        parentsnew,
                        opc.ref,
                        reducedchildren(opc),
                    )
                    addsetv!(ls_new.includedactualarrays, vptr(opc.ref))
                    push!(operations(ls_new), opnew)
                    push!(vparents, opnew)
                    for i ∈ 2:length(parentsopc)
                        push!(
                            parentsnew,
                            add_operation!(
                                ls_new, included, ls, parentsopc[i], ids, issecond
                            ),
                        )
                    end
                    # Record the load as the stand-in for `opp` in `ls_new`.
                    included[identifier(opp)] = identifier(opnew)
                    found = true
                    break
                end
            end
            found && continue
        end
        push!(vparents, add_operation!(ls_new, included, ls, opp, ids, issecond))
    end
    opnew = Operation(
        length(operations(ls_new)),
        name(op),
        op.elementbytes,
        instruction(op),
        op.node_type,
        loopdependencies(op),
        reduceddependencies(op),
        vparents,
        op.ref,
        reducedchildren(op),
    )
    accesses_memory(op) && addsetv!(ls_new.includedactualarrays, vptr(op.ref))
    push!(operations(ls_new), opnew)
    included[identifier(op)] = identifier(opnew)
    return opnew
end
34
46
35
47
"""
    append_if_included!(vnew, vold, included)

For every `(i, v)` pair in `vold`, look up `included[i]`; when it is non-zero, push
`(included[i], v)` onto `vnew`. Pairs whose lookup is zero are skipped.

`vnew` is mutated in place; the function returns `nothing`.
"""
function append_if_included!(vnew, vold, included)
    for (i, v) ∈ vold
        id = included[i]
        # A zero entry means the operation `i` was not carried over; drop the pair.
        iszero(id) || push!(vnew, (id, v))
    end
    return nothing
end
41
53
42
"""
    split_loopset(ls::LoopSet, ids::Vector{Int}, issecond::Bool)

Build and return a new `LoopSet` containing the operations of `ls` indexed by `ids`,
together with whatever `add_operation!` pulls in for them.

Steps, in order:
1. Copy each `operations(ls)[i]` for `i ∈ ids` via `add_operation!`, forwarding `ids`
   and `issecond`.
2. Add to the new loop set every loop of `ls` that a copied operation depends on,
   stopping early once it has as many loop symbols as `ls`.
3. Copy the preamble entries and outer reductions whose operation was included,
   remapped through the `included` table.
4. Copy the hardware parameters and vector width from `ls`.
"""
function split_loopset(ls::LoopSet, ids::Vector{Int}, issecond::Bool)
    ls_new = LoopSet(:LoopVectorization)
    # included[k] stays 0 until operation k of `ls` has a counterpart in `ls_new`.
    included = zeros(Int, length(operations(ls)))
    for i ∈ ids
        add_operation!(ls_new, included, ls, operations(ls)[i], ids, issecond)
    end
    for op ∈ operations(ls_new)
        for l ∈ loopdependencies(op)
            if l ∉ ls_new.loopsymbols
                add_loop!(ls_new, getloop(ls, l))
            end
        end
        # Every loop of `ls` is already present; nothing more to add.
        length(ls_new.loopsymbols) == length(ls.loopsymbols) && break
    end
    append_if_included!(ls_new.preamble_symsym, ls.preamble_symsym, included)
    append_if_included!(ls_new.preamble_symint, ls.preamble_symint, included)
    append_if_included!(ls_new.preamble_symfloat, ls.preamble_symfloat, included)
    append_if_included!(ls_new.preamble_zeros, ls.preamble_zeros, included)
    append_if_included!(
        ls_new.preamble_funcofeltypes, ls.preamble_funcofeltypes, included
    )
    for i ∈ ls.outer_reductions
        id = included[i]
        iszero(id) || push!(ls_new.outer_reductions, id)
    end
    # TODO: allow them to differ. E.g., non-AVX2 x86 cpus don't have efficient integer
    # calculations. Therefore, it would be profitable to split for this reason.
    # However, currently the default assumption in vector width will be wrong, so we
    # should calculate it correctly (like ls.vector_width); wrong (too high) value will
    # encourage splitting when it shouldn't.
    # Current behavior is incorrect when VECWIDTH chosen does actually differ between
    # split loops and the loops are statically sized, because code gen will then assume
    # it is correct...
    l1, l2, l3 = cache_sze(ls)
    set_hw!(ls_new, reg_size(ls), reg_count(ls), cache_lnsze(ls), l1, l2, l3)
    ls_new.vector_width = ls.vector_width
    # NOTE(review): this is invoked on the original `ls`, not on `ls_new`; confirm that
    # is intended before changing it.
    fill_offset_memop_collection!(ls)
    return ls_new
end
78
93
79
94
function returned_ops (ls:: LoopSet )
@@ -96,14 +111,14 @@ function lower_and_split_loops(ls::LoopSet, inline::Int)
96
111
# for (ind,i) ∈ enumerate(split_candidates)
97
112
for (ind,i) ∈ enumerate (split_candidates)
98
113
split_1[1 ] = i
99
- ls_1 = split_loopset (ls, split_1)
114
+ ls_1 = split_loopset (ls, split_1, false )
100
115
order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, cost_1, shouldinline_1 = choose_order_cost (ls_1)
101
116
remaining_ops[1 : ind- 1 ] .= @view (split_candidates[1 : ind- 1 ]); remaining_ops[ind: end ] .= @view (split_candidates[ind+ 1 : end ])
102
- ls_2 = split_loopset (ls, remaining_ops)
117
+ ls_2 = split_loopset (ls, remaining_ops, true )
103
118
order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2, shouldinline_2 = choose_order_cost (ls_2)
104
119
# U_1 = T_1 = U_2 = T_2 = 2
105
120
# return ls_1, ls_2
106
- # @show cost_1 + cost_2 ≤ cost_fused, cost_1, cost_2, cost_fused
121
+ # @show cost_1 + cost_2 ≤ 0.9cost_fused, (cost_1 + cost_2) / cost_fused, cost_1, cost_2, cost_fused
107
122
if cost_1 + cost_2 ≤ 0.9 cost_fused
108
123
ls_2_lowered = if length (remaining_ops) > 1
109
124
inline = iszero (inline) ? (shouldinline_1 % Int) : inline
0 commit comments