Skip to content

Commit 84de1a6

Browse files
committed
Tweak/improvement that should primarily benefit statically sized arrays.
1 parent f588015 commit 84de1a6

File tree

4 files changed

+39
-13
lines changed

4 files changed

+39
-13
lines changed

src/costs.jl

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ function vector_cost(ic::InstructionCost, Wshift, sizeof_T)
9494
srt, sl, srp
9595
end
9696

97-
const OPAQUE_INSTRUCTION = InstructionCost(-1.0, 50, 50.0, REGISTER_COUNT)
97+
const OPAQUE_INSTRUCTION = InstructionCost(-1.0, 40, 40.0, REGISTER_COUNT)
9898

9999
instruction_cost(instruction::Instruction) = instruction.mod === :LoopVectorization ? COST[instruction.instr] : OPAQUE_INSTRUCTION
100100
instruction_cost(instruction::Symbol) = get(COST, instruction, OPAQUE_INSTRUCTION)
@@ -219,8 +219,10 @@ const COST = Dict{Symbol,InstructionCost}(
219219
:vdivlog10add! =>InstructionCost(13,4.0,-2.0),
220220
:sqrt => InstructionCost(15,4.0,-2.0),
221221
:sqrt_fast => InstructionCost(15,4.0,-2.0),
222-
:log => InstructionCost(20,20.0,40.0,20),
222+
:log => InstructionCost(20,20.0,20.0,20),
223+
:log1p => InstructionCost(20,25.0,25.0,20), # FIXME
223224
:exp => InstructionCost(20,20.0,20.0,18),
225+
:expm1 => InstructionCost(20,25.0,25.0,18), # FIXME
224226
:(^) => InstructionCost(40,40.0,40.0,26), # FIXME
225227
:sin => InstructionCost(18,15.0,68.0,23),
226228
:cos => InstructionCost(18,15.0,68.0,26),

src/determinestrategy.jl

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_
7070
if opisvectorized
7171
if !unitstride(ls, op, vectorized)# || !isdense(op) # need gather/scatter
7272
r = (1 << Wshift)
73-
srt *= r
73+
srt *= r# * 2
7474
sl *= r
7575
elseif isload(op) & (length(loopdependencies(op)) > 1)# vmov(a/u)pd
7676
# penalize vectorized loads with more than 1 loopdep
@@ -258,7 +258,7 @@ function determine_unroll_factor(
258258
# So if num_reductions > 0, we set the unroll factor to be high enough so that the CPU can be kept busy
259259
# if there are, U = max(1, round(Int, max(latency) * throughput / num_reductions)) = max(1, round(Int, latency / (recip_throughput * num_reductions)))
260260
# We also make sure register pressure is not too high.
261-
latency = 1
261+
latency = 1.0
262262
# compute_recip_throughput_u = 0.0
263263
compute_recip_throughput = 0.0
264264
visited_nodes = fill(false, length(operations(ls)))
@@ -341,6 +341,8 @@ end
341341
function unroll_cost(X, u₁, u₂, u₁L, u₂L)
342342
u₂factor = (num_iterations(u₂L, u₂)/u₂L)
343343
u₁factor = (num_iterations(u₁L, u₁)/u₁L)
344+
# @show num_iterations(u₂L, u₂)/u₂L, u₂, u₂L
345+
# @show num_iterations(u₁L, u₁)/u₁L, u₁, u₁L
344346
# X[1]*u₂factor*u₁factor + X[4] + X[2] * u₂factor + X[3] * u₁factor
345347
X[1] + X[2] * u₂factor + X[3] * u₁factor + X[4] * u₁factor * u₂factor
346348
end
@@ -366,6 +368,7 @@ function solve_unroll_iter(X, R, u₁L, u₂L, u₁range, u₂range)
366368
for u₂temp ∈ u₂range
367369
RR ≥ u₁temp*u₂temp*R₁ + u₁temp*R₂ + u₂temp*R₅ || continue
368370
tempcost = unroll_cost(X, u₁temp, u₂temp, u₁L, u₂L)
371+
# @show u₁temp, u₂temp, tempcost
369372
if tempcost ≤ bestcost
370373
bestcost = tempcost
371374
u₁best, u₂best = u₁temp, u₂temp
@@ -394,7 +397,7 @@ function solve_unroll(X, R, u₁L, u₂L, u₁step, u₂step)
394397
return solve_unroll_iter(X, R, u₁L, u₂L, u₁low:u₁step:u₁high, u₂low:u₂step:u₂high)
395398
end
396399
u₁low = floor(Int, u₁float)
397-
u₂low = max(u₂step, floor(Int, u₂float)) # must be at least 1
400+
u₂low = max(u₂step, floor(Int, 0.8u₂float)) # must be at least 1
398401
u₁high = solve_unroll_constT(R, u₂low) + u₁step
399402
u₂high = solve_unroll_constU(R, u₁low) + u₂step
400403
maxunroll = REGISTER_COUNT == 32 ? (((X₂ > 0) & (X₃ > 0)) ? 10 : 8) : 6
@@ -498,19 +501,30 @@ function solve_unroll(
498501
u₁ = isstaticloop(u₁loop) ? maybedemotesize(u₁, u₁L) : u₁
499502
return u₁, u₂L, unroll_cost(cost_vec, u₁, u₂L, u₁L, u₂L)
500503
end
501-
u₂L = u₂loopsym === vectorized ? cld(u₂L,W) : u₂L
502-
maxu₂ = min(4maxu₂, u₂L)
504+
u₂Ltemp = u₂loopsym === vectorized ? cld(u₂L, W) : u₂L
505+
maxu₂ = min(4maxu₂, u₂Ltemp)
503506
end
504507
if isstaticloop(u₁loop)
505508
if u₁loopsym !== vectorized && u₁L ≤ 4
506509
u₂ = max(1, solve_unroll_constU(reg_pressure, u₁L))
507510
u₂ = isstaticloop(u₂loop) ? maybedemotesize(u₂, u₂L) : u₂
508511
return u₁L, u₂, unroll_cost(cost_vec, u₁L, u₂, u₁L, u₂L)
509512
end
510-
u₁L = u₁loopsym === vectorized ? cld(u₁L,W) : u₁L
511-
maxu₁ = min(4maxu₁, u₁L)
513+
u₁Ltemp = u₁loopsym === vectorized ? cld(u₁L, W) : u₁L
514+
maxu₁ = min(4maxu₁, u₁Ltemp)
512515
end
513-
u₁, u₂, cost = solve_unroll(cost_vec, reg_pressure, maxu₁, maxu₂, length(u₁loop), length(u₂loop), u₁step, u₂step)
516+
if u₁loopsym === vectorized
517+
u₁Lf = u₁L / W
518+
else
519+
u₁Lf = Float64(u₁L)
520+
end
521+
if u₂loopsym === vectorized
522+
u₂Lf = u₂L / W
523+
else
524+
u₂Lf = Float64(u₂L)
525+
end
526+
# @show u₁Lf, u₂Lf, u₁L, length(u₁loop)
527+
u₁, u₂, cost = solve_unroll(cost_vec, reg_pressure, maxu₁, maxu₂, u₁Lf, u₂Lf, u₁step, u₂step)
514528
# heuristic to more evenly divide small numbers of iterations
515529
if isstaticloop(u₂loop)
516530
u₂ = maybedemotesize(u₂, length(u₂loop), u₁, u₁loop, maxu₂base)
@@ -884,7 +898,7 @@ function evaluate_cost_tile(
884898
if isstore(op) & (!u₁reducesrt) & (!u₂reducesrt)
885899
irreducible_storecosts += rt
886900
end
887-
# @show u₁reducesrt, u₂reducesrt, op, rt, rto, rp
901+
# iiter = convert(Int, iters[id]); @show u₁reducesrt, u₂reducesrt, op, rt, rto, rp, iiter
888902
update_costs!(cost_vec, rt, u₁reducesrt, u₂reducesrt)
889903
update_costs!(reg_pressure, rp, u₁reducesrp, u₂reducesrp)
890904
end
@@ -1070,6 +1084,9 @@ function choose_order_cost(ls::LoopSet)
10701084
if num_loops(ls) > 1
10711085
torder, tunroll, ttile, tvec, tU, tT, tc, shouldinline = choose_tile(ls)
10721086
else
1087+
torder = names(ls) # dummy
1088+
tunroll = ttile = tvec = Symbol("##undefined##") # dummy
1089+
tU = tT = 0 # dummy
10731090
tc = Inf
10741091
end
10751092
uorder, uvec, uc = choose_unroll_order(ls, tc)

src/graphs.jl

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -764,7 +764,14 @@ end
764764
765765
Convert to `Float64` for the sake of non-64 bit platforms.
766766
"""
767-
looplengthprod(ls::LoopSet) = prod(Float64 ∘ length, ls.loops)
767+
function looplengthprod(ls::LoopSet)
768+
l = 1.0
769+
for loop ∈ ls.loops
770+
l *= Float64(length(loop))
771+
end
772+
l
773+
end
774+
# prod(Float64 ∘ length, ls.loops)
768775

769776

770777
function looplength(ls::LoopSet, s::Symbol)

test/gemm.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@
351351
end);
352352
lsr2amb = LoopVectorization.LoopSet(r2ambq);
353353
if LoopVectorization.REGISTER_COUNT == 32
354-
@test LoopVectorization.choose_order(lsr2amb) == ([:n, :m, :k], :n, :m, :m, 7, 3)
354+
@test LoopVectorization.choose_order(lsr2amb) == ([:n, :m, :k], :m, :n, :m, 3, 7)
355355
elseif LoopVectorization.REGISTER_COUNT == 16
356356
@test LoopVectorization.choose_order(lsr2amb) == ([:m, :n, :k], :n, :m, :m, 4, 2)
357357
end

0 commit comments

Comments
 (0)