@@ -792,46 +792,48 @@ function maxnegativeoffset(ls::LoopSet, op::Operation, unrollsyms::UnrollSymbols
792
792
mno, i
793
793
end
794
794
"""
    load_elimination_cost_factor!(
        cost_vec, reg_pressure, choose_to_inline, ls, op, iters, unrollsyms, Wshift, size_T
    )

Accumulate the cost of `op` into `cost_vec` and `reg_pressure` when
`first(isoptranslation(ls, op, unrollsyms))` is nonzero.

When that condition holds, the function

  - sets `choose_to_inline[] = true`,
  - adds the reciprocal throughput returned by `cost`, scaled by `iters`, to
    `cost_vec[2]` and `cost_vec[3]`,
  - adds the register pressure `rp` returned by `cost` to `reg_pressure[2]` and
    `reg_pressure[3]`, and `0.25rp` to `reg_pressure[1]`,

and returns `true`. Otherwise nothing is mutated and `false` is returned.
`cost_vec[1]` is never modified here.
"""
function load_elimination_cost_factor!(
    cost_vec, reg_pressure, choose_to_inline,
    ls::LoopSet, op::Operation, iters, unrollsyms::UnrollSymbols, Wshift, size_T
)
    @unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms
    # Only ops for which `isoptranslation` reports a nonzero first component are handled.
    iszero(first(isoptranslation(ls, op, unrollsyms))) && return false

    # `cost` returns (reciprocal throughput, latency, register pressure); latency is unused.
    rt, _, rp = cost(ls, op, (u₁loopsym, u₂loopsym), vloopsym, Wshift, size_T)
    rt *= iters
    choose_to_inline[] = true

    # NOTE: several experimental heuristics that were commented out in the original
    # (a short-static-loop register-pressure adjustment, decrementing `rp` by one, and
    # subtracting from `cost_vec[1]`) are deliberately not applied.
    reg_pressure[1] += 0.25rp
    cost_vec[2] += rt
    reg_pressure[2] += rp
    cost_vec[3] += rt
    # currently only place `reg_pressure[3]` is updated
    reg_pressure[3] += rp
    return true
end
836
838
function loadintostore (ls:: LoopSet , op:: Operation )
837
839
isload (op) || return false # leads to bad behavior more than it helps
@@ -888,6 +890,10 @@ function add_constant_offset_load_elmination_cost!(
888
890
# we treat the unrolled loop that is being eliminated as split into 2 parts:
889
891
# 1 a non-cost-reduced part, with factor udependent_reduction
890
892
# 2 a cost-reduced part, with factor uindependent_increase
893
+ if opisininnerloop
894
+ u₁c, u₂c = child_dependent_u₁u₂ (op)
895
+ rp = max (zero (rp), rp - one (rp))
896
+ end
891
897
if uid == 1 # u₁reduces was false
892
898
@assert ! u₁reduces
893
899
# max negative offset was in the u₁ unroll direction
0 commit comments