Skip to content

Commit 23efbf4

Browse files
committed
some cleanup
1 parent 77d0efb commit 23efbf4

File tree

5 files changed

+9
-166
lines changed

5 files changed

+9
-166
lines changed

src/codegen/lower_load.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ function lower_load_no_optranslation!(
197197
end
198198
push!(q.args, Expr(:(=), mvar, Expr(:call, lv(:VecUnroll), t)))
199199
else
200-
inds = mem_offset_u(op, td, inds_calc_by_ptr_offset, true, 0, ls, false)#= not unrolled =#
200+
inds = mem_offset_u(op, td, inds_calc_by_ptr_offset, true, 0, ls, false) # not unrolled
201201
loadexpr = Expr(:call, lv(:_vload), sptr(op), inds)
202202
add_memory_mask!(loadexpr, op, td, mask, ls, 0)
203203
push!(loadexpr.args, falseexpr, rs)

src/codegen/lower_store.jl

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ function lower_store_collection!(
107107
t = Expr(:tuple)
108108
for (opid, _) ∈ idsformap
109109
opp = first(parents(ops[opidmap[opid]]))
110-
110+
111111
isu₁, isu₂ = isunrolled_sym(opp, u₁loopsym, u₂loopsym, vloopsym, ls)#, __u₂max)
112112
u = Core.ifelse(isu₁, u₁, 1)
113113
if isloopvalue(opp)
@@ -317,12 +317,7 @@ function lower_tiled_store!(
317317
end
318318
end
319319

320-
function donot_tile_store(
321-
ls::LoopSet,
322-
op::Operation,
323-
reductfunc::Symbol,
324-
u₂::Int,
325-
)
320+
function donot_tile_store(ls::LoopSet, op::Operation, reductfunc::Symbol, u₂::Int)
326321
(
327322
(!((reductfunc === Symbol("")) && all(op.ref.loopedindex))) ||
328323
(u₂ ≤ 1) ||

src/codegen/lower_threads.jl

Lines changed: 3 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ end
8787
# miter decreases in each iteration of factors
8888
miter, niter = factors[i]
8989

90-
r = (MoW % miter)
90+
r = MoW % miter
9191
# if ((miter * W * U * 2) ≤ M - (W+W)) & ((r == 0) | (miter == (r+1)))
9292
mlarge = (miter * (U * 2)) ≤ MoW - 2
9393
# we want `mlarge` enough, or there to be no remainder (`r == 0`)
@@ -103,19 +103,6 @@ end
103103
@inbounds factors[(length(factors)+1)>>>1]
104104
end
105105

106-
# struct ChooseNumBlocks{U,C} <: Function end
107-
# function (cnb::ChooseNumBlocks{U,C})(M::UInt) where {U,C}
108-
# choose_num_blocks(M, StaticInt{U}(), StaticInt{C}())
109-
# end
110-
111-
# @generated function choose_num_block_table(::StaticInt{U}, ::StaticInt{NC}) where {U,NC}
112-
# t = Expr(:tuple)
113-
# for n ∈ 1:NC
114-
# cnb = :(ChooseNumBlocks{$U,$n}())
115-
# push!(t.args, :(@cfunction($cnb, Tuple{UInt,UInt}, (UInt,))))
116-
# end
117-
# t
118-
# end
119106
@generated function choose_num_block_table(::StaticInt{NC}) where {NC}
120107
t = Expr(:tuple)
121108
for n ∈ 1:NC
@@ -130,13 +117,6 @@ end
130117
nt,
131118
::StaticInt{NTMAX},
132119
) where {U,NTMAX}
133-
# valid range for nt: 2 ≤ nt ≤ NTMAX
134-
# if NTMAX > 8
135-
# return quote
136-
# $(Expr(:meta,:inline))
137-
# choose_num_blocks_table(M, StaticInt{$U}(), nt, StaticInt{$NTMAX}())
138-
# end
139-
# else
140120
if NTMAX == 2 # `nt` must be `2`
141121
return quote
142122
$(Expr(:meta, :inline))
@@ -166,16 +146,6 @@ function add_bisecting_if_branches!(q, lb, ub, U, isfirst::Bool)
166146
return
167147
end
168148

169-
# @inline function choose_num_blocks_table(M, ::StaticInt{U}, nt, ::StaticInt{NTMAX}) where {U,NTMAX}
170-
# if nt == NTMAX
171-
# choose_num_blocks(M % UInt, StaticInt{U}(), StaticInt{NTMAX}())
172-
# else
173-
# @inbounds fptr = choose_num_block_table(StaticInt{U}(), StaticInt{NTMAX}())[nt]
174-
# VectorizationBase.assume(fptr ≠ C_NULL)
175-
# ccall(fptr, Tuple{UInt,UInt}, (UInt,), M%UInt)
176-
# end
177-
# end
178-
179149
# if a threaded loop is vectorized, call
180150
@inline function choose_num_blocks(M, ::StaticInt{U}, nt) where {U}
181151
_choose_num_blocks(M % UInt, StaticInt{U}(), nt, lv_max_num_threads())
@@ -184,21 +154,6 @@ end
184154
@inline choose_num_blocks(nt, ::StaticInt{NC} = lv_max_num_threads()) where {NC} =
185155
@inbounds choose_num_block_table(StaticInt{NC}())[nt]
186156

187-
188-
189-
# The goal is to minimimize the maximum costs...
190-
# But maybe 'relatively even sizes' heuristics are more robust than fancy modeling?
191-
# At least early on, before lots of test cases with different sorts of loops have informed the modeling.
192-
#
193-
# goal is to produce `nblocks` roughly even block sizes (bM, bN), such that `bM % fM == bN % fN == 0`.
194-
# function roughly_even_blocks(M, N, fM, fN, nblocks)
195-
# M_N_ratio = M / N
196-
# block_per_m = sqrt(nblocks * M_N_ratio) # obv not even
197-
# blocks_per_n = block_per_m / M_N_ratio
198-
# mi = cld(M, fM)
199-
# ni = cld(N, fN)
200-
# block_per_m, blocks_per_n
201-
# end
202157
if Sys.ARCH === :x86_64
203158
@inline function choose_num_threads(
204159
C::T,
@@ -280,7 +235,6 @@ function outer_reduct_combine_expressions(ls::LoopSet, retv)
280235
# push!(q.args, :(@show var"#load#thread#ret#"))
281236
for (i, or) ∈ enumerate(ls.outer_reductions)
282237
op = ls.operations[or]
283-
var = name(op)
284238
mvar = mangledvar(op)
285239
instr = instruction(op)
286240
out = Symbol(mvar, "##onevec##")
@@ -328,7 +282,6 @@ function thread_loop_summary!(
328282
)
329283
W = ls.vector_width
330284
@unpack u₁loop, u₂loop, vloop, u₁, u₂max = ua
331-
u₂ = u₂max
332285
threadloopnumtag = Int(issecondthreadloop)
333286
lensym = Symbol("#len#thread#$threadloopnumtag#")
334287
define_len = if isstaticloop(threadedloop)
@@ -505,7 +458,7 @@ function thread_one_loops_expr(
505458
thread_loop_summary!(ls, ua, threadedloop, false)
506459
loopboundexpr = Expr(:tuple) # for launched threads
507460
lastboundexpr = Expr(:tuple) # remainder, started on main thread
508-
for (i, loop) ∈ enumerate(ls.loops)
461+
for loop ∈ ls.loops
509462
if loop === threadedloop
510463
push!(loopboundexpr.args, looprange)
511464
push!(lastboundexpr.args, lastrange)
@@ -546,8 +499,6 @@ function thread_one_loops_expr(
546499
if var"##do#thread##"
547500
var"#threads#tuple#", var"#torelease#tuple#" =
548501
PolyesterWeave.request_threads(var"#nrequest#")
549-
# var"#threads#tuple#", var"#torelease#tuple#" = PolyesterWeave.request_threads(Threads.threadid()%UInt32, var"#nrequest#")
550-
551502
var"#thread#factor#0#" = var"#nthreads#"
552503
$iterdef
553504
var"#thread#id#" = 0x00000000
@@ -627,7 +578,6 @@ function thread_one_loops_expr(
627578
end
628579
$retexpr
629580
end
630-
# Expr(:block, Expr(:meta,:inline), ls.preamble, q)
631581
Expr(:block, ls.preamble, q)
632582
end
633583
function define_vthread_blocks(vloop, u₁loop, u₂loop, u₁, u₂, ntmax, tn)
@@ -710,7 +660,6 @@ function thread_two_loops_expr(
710660
end
711661
@unpack u₁loop, u₂loop, vloop, u₁, u₂max = ua
712662
u₂ = u₂max
713-
W = ls.vector_width
714663
threadedloop1 = getloop(ls, threadedid1)
715664
threadedloop2 = getloop(ls, threadedid2)
716665
define_len1, define_num_unrolls1, loopstart1, iterstop1, looprange1, lastrange1 =
@@ -719,7 +668,7 @@ function thread_two_loops_expr(
719668
thread_loop_summary!(ls, ua, threadedloop2, true)
720669
loopboundexpr = Expr(:tuple)
721670
lastboundexpr = Expr(:tuple)
722-
for (i, loop) ∈ enumerate(ls.loops)
671+
for loop ∈ ls.loops
723672
if loop === threadedloop1
724673
push!(loopboundexpr.args, looprange1)
725674
push!(lastboundexpr.args, lastrange1)
@@ -741,11 +690,9 @@ function thread_two_loops_expr(
741690
Val(typeof(var"#avx#call#args#")),
742691
flatten_to_tuple(var"#avx#call#args#")...,
743692
))
744-
# _turbo_orig_ = :(_turbo_!(Val{$UNROLL}(), $OPS, $ARF, $AM, $LPSYM, var"#lv#tuple#args#"))
745693
update_return_values = if length(ls.outer_reductions) > 0
746694
retv = loopset_return_value(ls, Val(false))
747695
_turbo_call_ = Expr(:(=), retv, _turbo_call_)
748-
# _turbo_orig_ = Expr(:(=), retv, _turbo_orig_)
749696
outer_reduct_combine_expressions(ls, retv)
750697
else
751698
nothing
@@ -757,15 +704,10 @@ function thread_two_loops_expr(
757704
retexpr = length(ls.outer_reductions) > 0 ? :(return $retv) : :(return nothing)
758705
q = quote
759706
$choose_nthread # UInt
760-
# @show var"#nthreads#"
761707
$loopstart1
762708
$loopstart2
763709
var"##do#thread##" = var"#nthreads#" > one(var"#nthreads#")
764710
if var"##do#thread##"
765-
# if var"#nthreads#" ≤ 1
766-
# $_turbo_orig_
767-
# return $retexpr
768-
# end
769711
$define_len1
770712
$define_len2
771713
$define_num_unrolls1
@@ -913,7 +855,6 @@ function thread_two_loops_expr(
913855
end
914856
$retexpr
915857
end
916-
# Expr(:block, Expr(:meta,:inline), ls.preamble, q)
917858
Expr(:block, ls.preamble, q)
918859
end
919860

@@ -957,8 +898,6 @@ function avx_threads_expr(
957898
)
958899
valid_thread_loop, ua, c = valid_thread_loops(ls)
959900
num_candiates = sum(valid_thread_loop)
960-
# num_to_thread = min(num_candiates, 2)
961-
# candidate_ids =
962901
if (num_candiates == 0) || (nt ≤ 1) # it was called from `avx_body` but now `nt` was set to `1`
963902
avx_body(ls, UNROLL)
964903
elseif (num_candiates == 1) || (nt ≤ 3)

0 commit comments

Comments
 (0)