@@ -1501,6 +1501,7 @@ struct LoopOrders
1501
1501
syms_nr:: Vector{Symbol}
1502
1502
syms_r:: Vector{Symbol}
1503
1503
buff:: Vector{Symbol}
1504
+ state:: Vector{Int}
1504
1505
end
1505
1506
1506
1507
function outer_reduct_loopordersplit (ls:: LoopSet )
@@ -1549,18 +1550,20 @@ function LoopOrders(ls::LoopSet)
1549
1550
LoopOrders (
1550
1551
nonreductsyms,
1551
1552
reductsyms,
1552
- Vector {Symbol} (undef, length (ls. loopsymbols))
1553
+ Vector {Symbol} (undef, length (ls. loopsymbols)),
1554
+ Vector {Int} (undef, length (ls. loopsymbols))
1553
1555
)
1554
1556
end
1555
1557
1556
"""
    nonreductview(lo::LoopOrders)

Return a view of the leading `length(lo.syms_nr)` entries of `lo.buff`.

Bounds checking is disabled for the view construction.
NOTE(review): this relies on `lo.buff` holding at least `length(lo.syms_nr)`
entries; confirm against the `LoopOrders` constructor.
"""
nonreductview(lo::LoopOrders) = @inbounds view(lo.buff, 1:length(lo.syms_nr))

"""
    reductview(lo::LoopOrders)

Return a view of the entries of `lo.buff` that follow the first
`length(lo.syms_nr)` entries, i.e. indices `1 + length(lo.syms_nr)` through
`length(lo.buff)`.

Bounds checking is disabled for the view construction.
"""
reductview(lo::LoopOrders) =
    @inbounds view(lo.buff, (1 + length(lo.syms_nr)):length(lo.buff))
1558
1560
"""
    Base.iterate(lo::LoopOrders)

Start iterating over `lo`.

Copies `lo.syms_nr` into the leading part of `lo.buff` and `lo.syms_r` into the
trailing part, zeroes the preallocated `lo.state` vector, and returns
`(lo.buff, (state_nr, state_r))` where `state_nr` and `state_r` are views of the
first `length(lo.syms_nr)` and the following `length(lo.syms_r)` entries of
`lo.state`.

The returned order is `lo.buff` itself, not a copy, and the iteration state
aliases `lo.state`; no new state vector is allocated per iteration.
NOTE(review): assumes `length(lo.state) >= length(lo.syms_nr) + length(lo.syms_r)`;
confirm against the `LoopOrders` constructor.
"""
function Base.iterate(lo::LoopOrders)
    _copyto!(nonreductview(lo), lo.syms_nr)
    _copyto!(reductview(lo), lo.syms_r)
    nr = length(lo.syms_nr)
    r = length(lo.syms_r)
    # Reuse the buffer stored on `lo` rather than allocating a fresh state vector.
    state = lo.state
    _fill!(state, 0)
    return lo.buff, (view(state, 1:nr), view(state, (1 + nr):(nr + r)))
end
1566
1569
@@ -1582,10 +1585,20 @@ function advance_state!(state)
1582
1585
end
1583
1586
true
1584
1587
end
1585
- function advance_state! (state, Nr)
1588
"""
    _copyto!(x, y)

Copy every element of `y` into the matching index of `x` and return `x`.

Hand-written element-wise loop used in place of `Base.copyto!`. The indices come
from `eachindex(x, y)`, which throws when the two arrays do not share the same
indices, so the `@inbounds` loop body never runs on mismatched inputs.
"""
function _copyto!(x, y)
    @inbounds for i in eachindex(x, y)
        x[i] = y[i]
    end
    # Return the destination, mirroring `Base.copyto!`.
    return x
end
1593
"""
    _fill!(x, y)

Set every element of `x` to `y` and return `x`.

Hand-written element-wise loop used in place of `Base.fill!`. The indices come
from `eachindex(x)`, so the `@inbounds` accesses are always valid for `x`.
"""
function _fill!(x, y)
    @inbounds for i in eachindex(x)
        x[i] = y
    end
    # Return the filled array, mirroring `Base.fill!`.
    return x
end
1598
"""
    advance_state!(state, Nr)::Bool

Advance a combined state vector whose first `Nr` entries form one sub-state and
whose remaining entries form a second sub-state.

The leading sub-state is advanced first with the one-argument
`advance_state!`; if that call returns `true`, this method returns `true`
immediately. Otherwise the leading sub-state is reset to zeros and the trailing
sub-state (indices `1 + Nr` through `length(state)`) is advanced; the result of
that call is returned, converted to `Bool`.
"""
function advance_state!(state, Nr)::Bool
    state_nr = view(state, 1:Nr)
    advance_state!(state_nr) && return true
    # The leading part could not advance: rewind it and step the trailing part.
    _fill!(state_nr, 0)
    return advance_state!(view(state, (1 + Nr):length(state)))
end
1591
1604
"""
    swap!(x::AbstractVector, i::Int, j::Int)

Exchange the elements of `x` at positions `i` and `j` in place.

Both elements are read before either is written, so `i == j` leaves `x`
unchanged. The value of the call is the tuple `(x[i], x[j])` as read before the
exchange.
"""
swap!(x::AbstractVector, i::Int, j::Int) = (x[j], x[i]) = (x[i], x[j])
@@ -1594,20 +1607,21 @@ function swap!(
1594
1607
src:: AbstractVector{Symbol} ,
1595
1608
offs:: AbstractVector{Int}
1596
1609
)
1597
- copyto ! (dest, src)
1610
+ _copyto ! (dest, src)
1598
1611
for i ∈ eachindex (offs)
1599
1612
sᵢ = offs[i]
1600
1613
sᵢ == 0 || swap! (dest, i, i + sᵢ)
1601
1614
end
1602
1615
end
1603
1616
# This is not a good algorithm
1604
- function Base. iterate (lo:: LoopOrders , (state_nr, state_r))
1617
+ @inline function Base. iterate (lo:: LoopOrders , states)
1618
+ (state_nr, state_r) = states
1605
1619
if advance_state! (state_nr)
1606
1620
swap! (nonreductview (lo), lo. syms_nr, state_nr)
1607
1621
else
1608
1622
advance_state! (state_r) || return nothing
1609
- fill ! (state_nr, 0 )
1610
- copyto ! (nonreductview (lo), lo. syms_nr)
1623
+ _fill ! (state_nr, 0 )
1624
+ _copyto ! (nonreductview (lo), lo. syms_nr)
1611
1625
swap! (reductview (lo), lo. syms_r, state_r)
1612
1626
end
1613
1627
lo. buff, (state_nr, state_r)
@@ -1644,7 +1658,7 @@ function choose_unroll_order(
1644
1658
cost_temp = evaluate_cost_unroll (ls, new_order, new_vec, lowest_cost, sld)
1645
1659
if cost_temp < lowest_cost
1646
1660
lowest_cost = cost_temp
1647
- copyto ! (best_order, new_order)
1661
+ _copyto ! (best_order, new_order)
1648
1662
best_vec = new_vec
1649
1663
end
1650
1664
end
@@ -1769,7 +1783,7 @@ function choose_tile(
1769
1783
bestu₂ = newu₂
1770
1784
bestu₁ = newu₁
1771
1785
loadelim = loadelim_temp
1772
- copyto ! (best_order, new_order)
1786
+ _copyto ! (best_order, new_order)
1773
1787
save_tilecost! (ls)
1774
1788
end
1775
1789
end
@@ -1822,7 +1836,7 @@ function choose_order_cost(ls::LoopSet, v::Int = 0)
1822
1836
mismatched = mismatchedstorereductions (ls)
1823
1837
if num_loops (ls) > 1 && tc ≤ uc
1824
1838
@assert ls. loop_order. bestorder === torder
1825
- # copyto !(ls.loop_order.bestorder, torder)
1839
+ # _copyto !(ls.loop_order.bestorder, torder)
1826
1840
return torder,
1827
1841
tunroll,
1828
1842
ttile,
@@ -1833,7 +1847,7 @@ function choose_order_cost(ls::LoopSet, v::Int = 0)
1833
1847
shouldinline
1834
1848
# return torder, tvec, 4, 4#5, 5
1835
1849
else
1836
- copyto ! (ls. loop_order. bestorder, uorder)
1850
+ _copyto ! (ls. loop_order. bestorder, uorder)
1837
1851
UF, uunroll = determine_unroll_factor (ls, uorder, uvec)
1838
1852
return uorder,
1839
1853
uunroll,
0 commit comments