Skip to content

Commit d96dbb3

Browse files
committed
Calculate reduceddependencies in a slightly less stupid way, as the difference of the loopdependencies of the parent and of itself.
1 parent 758a885 commit d96dbb3

19 files changed

+293
-187
lines changed

src/LoopVectorization.jl

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector
66
maybestaticlength, maybestaticsize, staticm1, subsetview, vzero,
77
Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange,
88
PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct
9-
using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod,
10-
vmullog2, vmullog10, vdivlog2, vdivlog2add, vdivlog10, vdivlog10add, vfmaddaddone
9+
using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod#,
10+
# vmullog2, vmullog10, vdivlog2, vdivlog2add, vdivlog10, vdivlog10add, vfmaddaddone
1111
using Base.Broadcast: Broadcasted, DefaultArrayStyle
1212
using LinearAlgebra: Adjoint, Transpose
1313
using MacroTools: prewalk, postwalk
@@ -22,6 +22,7 @@ include("map.jl")
2222
include("costs.jl")
2323
include("operations.jl")
2424
include("graphs.jl")
25+
include("operation_evaluation_order.jl")
2526
include("memory_ops_common.jl")
2627
include("add_loads.jl")
2728
include("add_stores.jl")
@@ -40,6 +41,7 @@ include("condense_loopset.jl")
4041
include("reconstruct_loopset.jl")
4142
include("constructors.jl")
4243

44+
4345
include("precompile.jl")
4446
_precompile_()
4547

src/add_compute.jl

Lines changed: 45 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,23 @@ function mergesetdiffv!(
2121
end
2222
nothing
2323
end
24+
# Everything in arg2 (s1) that isn't in arg3 (s2) is added to arg1 (s3)
2425
function setdiffv!(s3::AbstractVector{T}, s1::AbstractVector{T}, s2::AbstractVector{T}) where {T}
2526
for s ∈ s1
2627
(s ∈ s2) || (s ∉ s3 && push!(s3, s))
2728
end
2829
end
30+
function setdiffv!(s4::AbstractVector{T}, s3::AbstractVector{T}, s1::AbstractVector{T}, s2::AbstractVector{T}) where {T}
31+
for s ∈ s1
32+
(s ∈ s2) ? (s ∉ s4 && push!(s4, s)) : (s ∉ s3 && push!(s3, s))
33+
end
34+
end
2935
function update_deps!(deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, parent::Operation)
30-
mergesetdiffv!(deps, loopdependencies(parent), reduceddependencies(parent))
36+
mergesetv!(deps, loopdependencies(parent))#, reduceddependencies(parent))
3137
if !(isload(parent) || isconstant(parent)) && parent.instruction.instr ∉ (:reduced_add, :reduced_prod, :reduce_to_add, :reduce_to_prod)
3238
mergesetv!(reduceddeps, reduceddependencies(parent))
3339
end
40+
#
3441
nothing
3542
end
3643

@@ -42,19 +49,19 @@ function pushparent!(mpref::ArrayReferenceMetaPosition, parent::Operation)
4249
pushparent!(mpref.parents, mpref.loopdependencies, mpref.reduceddeps, parent)
4350
end
4451
function add_parent!(
45-
parents::Vector{Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, ls::LoopSet, var, elementbytes::Int = 8
52+
parents::Vector{Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, ls::LoopSet, var, elementbytes::Int, position::Int
4653
)
4754
parent = if var isa Symbol
4855
getop(ls, var, elementbytes)
4956
elseif var isa Expr #CSE candidate
50-
add_operation!(ls, gensym(:temporary), var, elementbytes)
57+
add_operation!(ls, gensym(:temporary), var, elementbytes, position)
5158
else # assumed constant
5259
add_constant!(ls, var, elementbytes)
5360
end
5461
pushparent!(parents, deps, reduceddeps, parent)
5562
end
5663
function add_reduction!(
57-
parents::Vector{Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, ls::LoopSet, var::Symbol, elementbytes::Int = 8
64+
parents::Vector{Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, ls::LoopSet, var::Symbol, elementbytes::Int
5865
)
5966
get!(ls.opdict, var) do
6067
add_constant!(ls, var, elementbytes)
@@ -80,10 +87,10 @@ function update_reduction_status!(parentvec::Vector{Operation}, deps::Vector{Sym
8087
end
8188
end
8289
function add_reduction_update_parent!(
83-
parents::Vector{Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, ls::LoopSet,
84-
var::Symbol, instr::Symbol, directdependency::Bool, elementbytes::Int = 8
90+
vparents::Vector{Operation}, deps::Vector{Symbol}, reduceddeps::Vector{Symbol}, ls::LoopSet,
91+
parent::Operation, instr::Symbol, directdependency::Bool, elementbytes::Int
8592
)
86-
parent = getop(ls, var, elementbytes)
93+
var = name(parent)
8794
isouterreduction = parent.instruction === LOOPCONSTANT
8895
Instr = instruction(ls, instr)
8996
instrclass = reduction_instruction_class(Instr) # key allows for faster lookups
@@ -110,27 +117,27 @@ function add_reduction_update_parent!(
110117
reductsym = var
111118
reductcombine = Symbol("")
112119
end
113-
setdiffv!(reduceddeps, deps, loopdependencies(reductinit))
114120
combineddeps = copy(deps); mergesetv!(combineddeps, reduceddeps)
115-
directdependency && pushparent!(parents, deps, reduceddeps, reductinit)#parent) # deps and reduced deps will not be disjoint
116-
update_reduction_status!(parents, combineddeps, name(reductinit))
121+
directdependency && pushparent!(vparents, deps, reduceddeps, reductinit)#parent) # deps and reduced deps will not be disjoint
122+
update_reduction_status!(vparents, combineddeps, name(reductinit))
117123
# this is the op added by add_compute
118-
op = Operation(length(operations(ls)), reductsym, elementbytes, instr, compute, deps, reduceddeps, parents)
124+
op = Operation(length(operations(ls)), reductsym, elementbytes, instr, compute, deps, reduceddeps, vparents)
119125
parent.instruction === LOOPCONSTANT && push!(ls.outer_reductions, identifier(op))
120126
opout = pushop!(ls, op, var) # note this overwrites the entry in the operations dict, but not the vector
127+
# isouterreduction || iszero(length(reduceddeps)) && return opout
121128
isouterreduction && return opout
122129
# create child op, which is the reduction combination
123-
childdeps = Symbol[]; childrdeps = Symbol[]; childparents = Operation[]
124-
pushparent!(childparents, childdeps, childrdeps, op) # reduce op
125-
pushparent!(childparents, childdeps, childrdeps, parent) # to
130+
childrdeps = Symbol[]; childparents = Operation[ op, parent ]
131+
childdeps = loopdependencies(reductinit)
132+
setdiffv!(childrdeps, loopdependencies(op), childdeps)
126133
child = Operation(
127134
length(operations(ls)), name(parent), elementbytes, reductcombine, compute, childdeps, childrdeps, childparents
128135
)
129136
pushop!(ls, child, name(parent))
130137
opout
131138
end
132139
function add_compute!(
133-
ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int = 8,
140+
ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int, position::Int,
134141
mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing
135142
)
136143
@assert ex.head === :call
@@ -149,12 +156,12 @@ function add_compute!(
149156
if isref
150157
if mpref == argref
151158
reduction = true
152-
add_load!(ls, var, mpref, elementbytes)
159+
add_load!(ls, var, argref, elementbytes)
153160
else
154161
pushparent!(parents, deps, reduceddeps, add_load!(ls, gensym(:tempload), argref, elementbytes))
155162
end
156163
else
157-
add_parent!(parents, deps, reduceddeps, ls, arg, elementbytes)
164+
add_parent!(parents, deps, reduceddeps, ls, arg, elementbytes, position)
158165
end
159166
elseif arg ∈ ls.loopsymbols
160167
loopsym = gensym(arg)
@@ -164,11 +171,30 @@ function add_compute!(
164171
push!(ls.refs_aliasing_syms, loopsymop.ref)
165172
pushparent!(parents, deps, reduceddeps, loopsymop)
166173
else
167-
add_parent!(parents, deps, reduceddeps, ls, arg, elementbytes)
174+
add_parent!(parents, deps, reduceddeps, ls, arg, elementbytes, position)
168175
end
169176
end
177+
if iszero(length(deps)) && reduction
178+
loopnestview = view(ls.loopsymbols, 1:position)
179+
append!(deps, loopnestview)
180+
append!(reduceddeps, loopnestview)
181+
else
182+
loopnestview = view(ls.loopsymbols, 1:position)
183+
newloopdeps = Symbol[]; newreduceddeps = Symbol[];
184+
setdiffv!(newloopdeps, newreduceddeps, deps, loopnestview)
185+
mergesetv!(newreduceddeps, reduceddeps)
186+
deps = newloopdeps; reduceddeps = newreduceddeps
187+
end
170188
if reduction || search_tree(parents, var)
171-
add_reduction_update_parent!(parents, deps, reduceddeps, ls, var, instr, reduction, elementbytes)
189+
parent = getop(ls, var, elementbytes)
190+
setdiffv!(reduceddeps, deps, loopdependencies(parent))
191+
if length(reduceddeps) == 0
192+
push!(parents, parent)
193+
op = Operation(length(operations(ls)), var, elementbytes, instruction(ls,instr), compute, deps, reduceddeps, parents)
194+
pushop!(ls, op, var)
195+
else
196+
add_reduction_update_parent!(parents, deps, reduceddeps, ls, parent, instr, reduction, elementbytes)
197+
end
172198
else
173199
op = Operation(length(operations(ls)), var, elementbytes, instruction(ls,instr), compute, deps, reduceddeps, parents)
174200
pushop!(ls, op, var)

src/add_constants.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ function add_constant!(ls::LoopSet, var::Symbol, mpref::ArrayReferenceMetaPositi
1717
pushop!(ls, op, temp)
1818
end
1919
# This version has loop dependencies. var gets assigned to sym when lowering.
20+
# value is what will get assigned within the loop.
21+
# assignedsym will be assigned to value within the preamble
2022
function add_constant!(
2123
ls::LoopSet, value::Symbol, deps::Vector{Symbol}, assignedsym::Symbol = gensym(:constant), elementbytes::Int = 8, f::Symbol = Symbol("")
2224
)

src/add_ifelse.jl

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,22 @@
44
## although stores and return values will.
55

66

7-
function add_if!(ls::LoopSet, LHS::Symbol, RHS::Expr, elementbytes::Int = 8, mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing)
7+
function add_if!(ls::LoopSet, LHS::Symbol, RHS::Expr, elementbytes::Int, position::Int, mpref::Union{Nothing,ArrayReferenceMetaPosition} = nothing)
88
# for now, just simple 1-liners
99
@assert length(RHS.args) == 3 "if statements without an else cannot be assigned to a variable."
1010
condition = first(RHS.args)
11-
condop = add_compute!(ls, gensym(:mask), condition, elementbytes, mpref)
11+
condop = add_compute!(ls, gensym(:mask), condition, elementbytes, position, mpref)
1212
iftrue = RHS.args[2]
1313
(iftrue isa Expr && iftrue.head !== :call) && throw("Only calls or constant expressions are currently supported in if/else blocks.")
14-
trueop = add_operation!(ls, Symbol(:iftrue), iftrue, elementbytes)
14+
trueop = add_operation!(ls, Symbol(:iftrue), iftrue, elementbytes, position)
1515
iffalse = RHS.args[3]
1616
(iffalse isa Expr && iffalse.head !== :call) && throw("Only calls or constant expressions are currently supported in if/else blocks.")
17-
falseop = add_operation!(ls, Symbol(:iffalse), iffalse, elementbytes)
17+
falseop = add_operation!(ls, Symbol(:iffalse), iffalse, elementbytes, position)
1818

1919
add_compute!(ls, LHS, :vifelse, [condop, trueop, falseop], elementbytes)
2020
end
2121

22-
function add_andblock!(ls::LoopSet, condop::Operation, LHS, rhsop::Operation, elementbytes::Int)
22+
function add_andblock!(ls::LoopSet, condop::Operation, LHS, rhsop::Operation, elementbytes::Int, position::Int)
2323
if LHS isa Symbol
2424
altop = getop(ls, LHS)
2525
return add_compute!(ls, LHS, :vifelse, [condop, rhsop, altop], elementbytes)
@@ -29,27 +29,27 @@ function add_andblock!(ls::LoopSet, condop::Operation, LHS, rhsop::Operation, el
2929
throw("Don't know how to assign onto $LHS.")
3030
end
3131
end
32-
function add_andblock!(ls::LoopSet, condop::Operation, LHS, RHS::Expr, elementbytes::Int)
33-
rhsop = add_compute!(ls, gensym(:iftruerhs), RHS, elementbytes)
34-
add_andblock!(ls, condop, LHS, rhsop, elementbytes)
32+
function add_andblock!(ls::LoopSet, condop::Operation, LHS, RHS::Expr, elementbytes::Int, position::Int)
33+
rhsop = add_compute!(ls, gensym(:iftruerhs), RHS, elementbytes, position)
34+
add_andblock!(ls, condop, LHS, rhsop, elementbytes, position)
3535
end
36-
function add_andblock!(ls::LoopSet, condop::Operation, LHS, RHS, elementbytes::Int)
36+
function add_andblock!(ls::LoopSet, condop::Operation, LHS, RHS, elementbytes::Int, position::Int)
3737
rhsop = getop(ls, RHS)
38-
add_andblock!(ls, condop, LHS, rhsop, elementbytes)
38+
add_andblock!(ls, condop, LHS, rhsop, elementbytes, position)
3939
end
40-
function add_andblock!(ls::LoopSet, condexpr::Expr, condeval::Expr, elementbytes::Int)
41-
condop = add_compute!(ls, gensym(:mask), condexpr, elementbytes)
40+
function add_andblock!(ls::LoopSet, condexpr::Expr, condeval::Expr, elementbytes::Int, position::Int)
41+
condop = add_compute!(ls, gensym(:mask), condexpr, elementbytes, position)
4242
@assert condeval.head === :(=)
4343
@assert length(condeval.args) == 2
4444
LHS = condeval.args[1]
4545
RHS = condeval.args[2]
46-
add_andblock!(ls, condop, LHS, RHS, elementbytes)
46+
add_andblock!(ls, condop, LHS, RHS, elementbytes, position)
4747
end
48-
function add_andblock!(ls::LoopSet, ex::Expr, elementbytes::Int)
49-
add_andblock!(ls, first(ex.args)::Expr, last(ex.args)::Expr, elementbytes)
48+
function add_andblock!(ls::LoopSet, ex::Expr, elementbytes::Int, position::Int)
49+
add_andblock!(ls, first(ex.args)::Expr, last(ex.args)::Expr, elementbytes, position)
5050
end
5151

52-
function add_orblock!(ls::LoopSet, condop::Operation, LHS, rhsop::Operation, elementbytes::Int)
52+
function add_orblock!(ls::LoopSet, condop::Operation, LHS, rhsop::Operation, elementbytes::Int, position::Int)
5353
if LHS isa Symbol
5454
altop = getop(ls, LHS)
5555
return add_compute!(ls, LHS, :vifelse, [condop, altop, rhsop], elementbytes)
@@ -60,23 +60,23 @@ function add_orblock!(ls::LoopSet, condop::Operation, LHS, rhsop::Operation, ele
6060
throw("Don't know how to assign onto $LHS.")
6161
end
6262
end
63-
function add_orblock!(ls::LoopSet, condop::Operation, LHS, RHS::Expr, elementbytes::Int)
64-
rhsop = add_compute!(ls, gensym(:iffalserhs), RHS, elementbytes)
65-
add_orblock!(ls, condop, LHS, rhsop, elementbytes)
63+
function add_orblock!(ls::LoopSet, condop::Operation, LHS, RHS::Expr, elementbytes::Int, position::Int)
64+
rhsop = add_compute!(ls, gensym(:iffalserhs), RHS, elementbytes, position)
65+
add_orblock!(ls, condop, LHS, rhsop, elementbytes, position)
6666
end
67-
function add_orblock!(ls::LoopSet, condop::Operation, LHS, RHS, elementbytes::Int)
67+
function add_orblock!(ls::LoopSet, condop::Operation, LHS, RHS, elementbytes::Int, position::Int)
6868
rhsop = getop(ls, RHS)
69-
add_orblock!(ls, condop, LHS, rhsop, elementbytes)
69+
add_orblock!(ls, condop, LHS, rhsop, elementbytes, position)
7070
end
71-
function add_orblock!(ls::LoopSet, condexpr::Expr, condeval::Expr, elementbytes::Int)
72-
condop = add_compute!(ls, gensym(:mask), condexpr, elementbytes)
71+
function add_orblock!(ls::LoopSet, condexpr::Expr, condeval::Expr, elementbytes::Int, position::Int)
72+
condop = add_compute!(ls, gensym(:mask), condexpr, elementbytes, position)
7373
@assert condeval.head === :(=)
7474
@assert length(condeval.args) == 2
7575
LHS = condeval.args[1]
7676
RHS = condeval.args[2]
77-
add_orblock!(ls, condop, LHS, RHS, elementbytes)
77+
add_orblock!(ls, condop, LHS, RHS, elementbytes, position)
7878
end
79-
function add_orblock!(ls::LoopSet, ex::Expr, elementbytes::Int)
80-
add_orblock!(ls, first(ex.args)::Expr, last(ex.args)::Expr, elementbytes)
79+
function add_orblock!(ls::LoopSet, ex::Expr, elementbytes::Int, position::Int)
80+
add_orblock!(ls, first(ex.args)::Expr, last(ex.args)::Expr, elementbytes, position)
8181
end
8282

src/add_stores.jl

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ function add_store!(
4040
if ref == opp.ref.ref
4141
id = opp.identifier
4242
break
43-
else
44-
@show ref opp.ref.ref
43+
# else
44+
# @show ref opp.ref.ref
4545
end
4646
end
4747
end
@@ -69,6 +69,14 @@ function add_store_ref!(ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int =
6969
array, raw_indices = ref_from_ref(ex)
7070
add_store!(ls, var, array, raw_indices, elementbytes)
7171
end
72+
function add_store_ref!(ls::LoopSet, var, ex::Expr, elementbytes::Int = 8)
73+
# array, raw_indices = ref_from_ref(ex)
74+
# mpref = array_reference_meta!(ls, array, raw_indices, elementbytes)
75+
# c = add_constant!(ls, var, loopdependencies(mpref), gensym(:storeconst), elementbytes)
76+
# add_store!(ls, name(c), mpref, elementbytes)
77+
c = add_constant!(ls, var, elementbytes)
78+
add_store_ref!(ls, name(c), ex, elementbytes)
79+
end
7280
function add_store_setindex!(ls::LoopSet, ex::Expr, elementbytes::Int = 8)
7381
array, raw_indices = ref_from_setindex(ex)
7482
add_store!(ls, (ex.args[2])::Symbol, array, rawindices, elementbytes)

src/broadcast.jl

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,16 +163,21 @@ function add_broadcast!(
163163
# this is the var name in the loop
164164
parents = Operation[]
165165
deps = Symbol[]
166-
reduceddeps = Symbol[]
166+
# reduceddeps = Symbol[]
167167
for (i,arg) ∈ enumerate(args)
168168
argname = gensym(:arg)
169169
pushpreamble!(ls, Expr(:(=), argname, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,@__FILE__), Expr(:ref, bcargs, i))))
170170
# dynamic dispatch
171171
parent = add_broadcast!(ls, gensym(:temp), argname, loopsyms, arg, elementbytes)::Operation
172-
pushparent!(parents, deps, reduceddeps, parent)
172+
push!(parents, parent)
173+
mergesetdiffv!(deps, loopdependencies(parent), reduceddependencies(parent))
174+
# if !(isload(parent) || isconstant(parent))# && parent.instruction.instr ∉ (:reduced_add, :reduced_prod, :reduce_to_add, :reduce_to_prod)
175+
# mergesetv!(reduceddeps, reduceddependencies(parent))
176+
# end
177+
# pushparent!(parents, deps, reduceddeps, parent)
173178
end
174179
op = Operation(
175-
length(operations(ls)), destname, elementbytes, instr, compute, deps, reduceddeps, parents
180+
length(operations(ls)), destname, elementbytes, instr, compute, deps, NOPARENTS, parents
176181
)
177182
pushop!(ls, op, destname)
178183
end

src/constructors.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
function Base.copyto!(ls::LoopSet, q::Expr)
55
q.head === :for || throw("Expression must be a for loop.")
6-
add_loop!(ls, q)
6+
add_loop!(ls, q, 8)
77
end
88

99
function add_ci_call!(q::Expr, f, args, syms, i, mod = nothing)

src/costs.jl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,11 +146,17 @@ const COST = Dict{Instruction,InstructionCost}(
146146
Instruction(:sin) => InstructionCost(18,15.0,68.0,23),
147147
Instruction(:cos) => InstructionCost(18,15.0,68.0,26),
148148
Instruction(:sincos) => InstructionCost(25,22.0,70.0,26),
149+
Instruction(:sinpi) => InstructionCost(18,15.0,68.0,23),
150+
Instruction(:cospi) => InstructionCost(18,15.0,68.0,26),
151+
Instruction(:sincospi) => InstructionCost(25,22.0,70.0,26),
149152
Instruction(:log_fast) => InstructionCost(20,20.0,40.0,20),
150153
Instruction(:exp_fast) => InstructionCost(20,20.0,20.0,18),
151154
Instruction(:sin_fast) => InstructionCost(18,15.0,68.0,23),
152155
Instruction(:cos_fast) => InstructionCost(18,15.0,68.0,26),
153156
Instruction(:sincos_fast) => InstructionCost(25,22.0,70.0,26),
157+
Instruction(:sinpi_fast) => InstructionCost(18,15.0,68.0,23),
158+
Instruction(:cospi_fast) => InstructionCost(18,15.0,68.0,26),
159+
Instruction(:sincospi_fast) => InstructionCost(25,22.0,70.0,26),
154160
Instruction(:identity) => InstructionCost(0,0.0,0.0,0),
155161
Instruction(:adjoint) => InstructionCost(0,0.0,0.0,0),
156162
Instruction(:transpose) => InstructionCost(0,0.0,0.0,0),

src/determinestrategy.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ function unroll_no_reductions(ls, order, vectorized, Wshift, size_T)
148148
# @show compute_rt, load_rt
149149
# roundpow2(min(4, round(Int, (compute_rt + load_rt + 1) / compute_rt)))
150150
rt = max(compute_rt, load_rt)
151+
rt == 0.0 && return 4
151152
max(1, roundpow2( min( 4, round(Int, 16 / rt) ) ))
152153
end
153154
function determine_unroll_factor(

0 commit comments

Comments
 (0)