Skip to content

Commit 7de5585

Browse files
N5N3adienes
andauthored
Use size based MultiplicativeInverse to speedup sequential access of ReshapedArray (#43518)
This performance difference was found when working on #42736. Currently, our `ReshapedArray` use stride based `MultiplicativeInverse` to speed up index transformation. For example, for `a::AbstractArray{T,3}` and `b = vec(a)`, the index transformation is equivalent to: ```julia offset = i - 1 # b[i] d1, r1 = divrem(offset, stride(a, 3)) # stride(a, 3) = size(a, 1) * size(a, 2) d2, r2 = divrem(r1, stride(a, 2)) # stride(a, 2) = size(a, 1) CartesianIndex(r2 + 1, d2 +1, d1 + 1) # a has one-based axes ``` (All the `stride` is replaced with a `MultiplicativeInverse` to accelerate `divrem`) This PR wants to replace the above machinery with: ```julia offset = i - 1 d1, r1 = divrem(offset, size(a, 1)) d2, r2 = divrem(d1, size(a, 2)) CartesianIndex(r1 + 1, r2 +1, d2 + 1) ``` For random access, they should have the same computational cost. But for sequential access, like `sum(b)`, `size` based transformation seems faster. To avoid bottleneck from IO, use `reshape(::CartesianIndices, x...)` to benchmark: ```julia f(x) = let r = 0 for i in eachindex(x) @inbounds r |= +(x[i].I...) end r end a = CartesianIndices((99,100,101)); @Btime f(vec($a)); #2.766 ms --> 2.591 ms @Btime f(reshape($a,990,1010)); #3.412 ms --> 2.626 ms @Btime f(reshape($a,33,300,101)); #3.422 ms --> 2.342 ms ``` I haven't looked into the reason for this performance difference. Beside acceleration, this also makes it possible to reuse the `MultiplicativeInverse` in some cases (like #42736). So I think it might be useful? --------- Co-authored-by: Andy Dienes <[email protected]> Co-authored-by: Andy Dienes <[email protected]>
1 parent 078e4ed commit 7de5585

File tree

3 files changed

+23
-20
lines changed

3 files changed

+23
-20
lines changed

base/abstractarray.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3116,14 +3116,14 @@ end
31163116
function _ind2sub_recurse(inds, ind)
31173117
@inline
31183118
r1 = inds[1]
3119-
indnext, f, l = _div(ind, r1)
3120-
(ind-l*indnext+f, _ind2sub_recurse(tail(inds), indnext)...)
3119+
indnext, indsub = divrem(ind, _indexlength(r1))
3120+
(_lookup(indsub, r1), _ind2sub_recurse(tail(inds), indnext)...)
31213121
end
31223122

3123+
_indexlength(d::Integer) = d
3124+
_indexlength(r::AbstractUnitRange) = length(r)
31233125
_lookup(ind, d::Integer) = ind+1
31243126
_lookup(ind, r::AbstractUnitRange) = ind+first(r)
3125-
_div(ind, d::Integer) = div(ind, d), 1, d
3126-
_div(ind, r::AbstractUnitRange) = (d = length(r); (div(ind, d), first(r), d))
31273127

31283128
# Vectorized forms
31293129
function _sub2ind(inds::Indices{1}, I1::AbstractVector{T}, I::AbstractVector{T}...) where T<:Integer

base/multidimensional.jl

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -673,21 +673,24 @@ module IteratorsMD
673673
# CartesianPartition.
674674
mi = iter.parent.mi
675675
ci = iter.parent.parent
676-
ax, ax1 = axes(ci), Base.axes1(ci)
677-
subs = Base.ind2sub_rs(ax, mi, first(iter.indices[1]))
678-
vl, fl = Base._sub2ind(tail(ax), tail(subs)...), subs[1]
679-
vr, fr = divrem(last(iter.indices[1]) - 1, mi[end]) .+ (1, first(ax1))
676+
ax1 = Base.axes1(ci)
677+
function splitdim1(i, mi)
678+
d, r = divrem(i - 1, mi)
679+
d + 1, r + first(ax1)
680+
end
681+
vl, fl = splitdim1(first(iter.indices[1]), mi[1])
682+
vr, fr = splitdim1(last(iter.indices[1]), mi[1])
683+
# form the iterator for outer dimensions, equivalent to vec(oci), but mi is reused
680684
oci = CartesianIndices(tail(ci.indices))
681-
# A fake CartesianPartition to reuse the outer iterate fallback
682-
outer = @inbounds view(ReshapedArray(oci, (length(oci),), mi), vl:vr)
683-
init = @inbounds dec(oci[tail(subs)...].I, oci.indices) # real init state
685+
roci = ReshapedArray(oci, (length(oci),), tail(mi))
686+
outer = @inbounds view(roci, vl:vr)
684687
# Use Generator to make inner loop branchless
685688
@inline function skip_len_I(i::Int, I::CartesianIndex)
686689
l = i == 1 ? fl : first(ax1)
687690
r = i == length(outer) ? fr : last(ax1)
688691
l - first(ax1), r - l + 1, I
689692
end
690-
(skip_len_I(i, I) for (i, I) in Iterators.enumerate(Iterators.rest(outer, (init, 0))))
693+
(skip_len_I(i, I) for (i, I) in Iterators.enumerate(outer))
691694
end
692695
@inline function simd_outer_range(iter::CartesianPartition{CartesianIndex{2}})
693696
# But for two-dimensional Partitions the above is just a simple one-dimensional range

base/reshapedarray.jl

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -234,10 +234,10 @@ _reshape(R::ReshapedArray, dims::Dims) = _reshape(R.parent, dims)
234234

235235
function __reshape(p::Tuple{AbstractArray,IndexStyle}, dims::Dims)
236236
parent = p[1]
237-
strds = front(size_to_strides(map(length, axes(parent))..., 1))
238-
strds1 = map(s->max(1,Int(s)), strds) # for resizing empty arrays
239-
mi = map(SignedMultiplicativeInverse, strds1)
240-
ReshapedArray(parent, dims, reverse(mi))
237+
szs = front(size(parent))
238+
szs1 = map(s -> max(1, Int(s)), szs) # for resizing empty arrays
239+
mi = map(SignedMultiplicativeInverse, szs1)
240+
ReshapedArray(parent, dims, mi)
241241
end
242242

243243
function __reshape(p::Tuple{AbstractArray{<:Any,0},IndexCartesian}, dims::Dims)
@@ -269,11 +269,11 @@ mightalias(A::ReshapedArray, B::SubArray) = mightalias(parent(A), B)
269269
mightalias(A::SubArray, B::ReshapedArray) = mightalias(A, parent(B))
270270

271271
@inline ind2sub_rs(ax, ::Tuple{}, i::Int) = (i,)
272-
@inline ind2sub_rs(ax, strds, i) = _ind2sub_rs(ax, strds, i - 1)
272+
@inline ind2sub_rs(ax, szs, i) = _ind2sub_rs(ax, szs, i - 1)
273273
@inline _ind2sub_rs(ax, ::Tuple{}, ind) = (ind + first(ax[end]),)
274-
@inline function _ind2sub_rs(ax, strds, ind)
275-
d, r = divrem(ind, strds[1])
276-
(_ind2sub_rs(front(ax), tail(strds), r)..., d + first(ax[end]))
274+
@inline function _ind2sub_rs(ax, szs, ind)
275+
d, r = divrem(ind, szs[1])
276+
(r + first(ax[1]), _ind2sub_rs(tail(ax), tail(szs), d)...)
277277
end
278278
offset_if_vec(i::Integer, axs::Tuple{<:AbstractUnitRange}) = i + first(axs[1]) - 1
279279
offset_if_vec(i::Integer, axs::Tuple) = i

0 commit comments

Comments
 (0)