Use size-based MultiplicativeInverse to speed up sequential access of ReshapedArray (#43518)
This performance difference was found when working on #42736.
Currently, our `ReshapedArray` uses a stride-based `MultiplicativeInverse`
to speed up the index transformation.
For example, for `a::AbstractArray{T,3}` and `b = vec(a)`, the index
transformation is equivalent to:
```julia
offset = i - 1 # b[i]
d1, r1 = divrem(offset, stride(a, 3)) # stride(a, 3) = size(a, 1) * size(a, 2)
d2, r2 = divrem(r1, stride(a, 2)) # stride(a, 2) = size(a, 1)
CartesianIndex(r2 + 1, d2 + 1, d1 + 1) # a has one-based axes
```
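The stride-based mapping above can be checked directly with plain `divrem` (no `MultiplicativeInverse`), here on a hypothetical `3 × 4 × 5` array:

```julia
# Minimal check of the stride-based index transformation, assuming a
# column-major 3-d array `a` of size (3, 4, 5) and its vectorized view `b`.
a = reshape(1:60, 3, 4, 5)
b = vec(a)
for i in eachindex(b)
    offset = i - 1
    d1, r1 = divrem(offset, size(a, 1) * size(a, 2))  # stride(a, 3)
    d2, r2 = divrem(r1, size(a, 1))                   # stride(a, 2)
    I = CartesianIndex(r2 + 1, d2 + 1, d1 + 1)
    @assert a[I] == b[i]  # the recovered Cartesian index hits the same element
end
```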
(Each `stride` here is replaced with a `MultiplicativeInverse` to
accelerate `divrem`.)
This PR replaces the above machinery with:
```julia
offset = i - 1
d1, r1 = divrem(offset, size(a, 1))
d2, r2 = divrem(d1, size(a, 2))
CartesianIndex(r1 + 1, r2 + 1, d2 + 1)
```
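The two mappings are interchangeable; a quick sketch (on an assumed `(3, 4, 5)` size tuple) confirms they recover the same Cartesian index for every linear index:

```julia
# Both transformations should agree for all linear indices of a (3, 4, 5) array.
sz = (3, 4, 5)
for i in 1:prod(sz)
    offset = i - 1
    # stride-based: divide by the largest stride first
    d1, r1 = divrem(offset, sz[1] * sz[2])
    d2, r2 = divrem(r1, sz[1])
    I_stride = CartesianIndex(r2 + 1, d2 + 1, d1 + 1)
    # size-based: peel off one dimension size at a time
    e1, s1 = divrem(offset, sz[1])
    e2, s2 = divrem(e1, sz[2])
    I_size = CartesianIndex(s1 + 1, s2 + 1, e2 + 1)
    @assert I_stride == I_size
end
```

The size-based form also has a nicer dependency chain for sequential access: each quotient feeds the next `divrem` directly, instead of rethreading the remainder.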
For random access the two schemes should have the same computational
cost, but for sequential access, like `sum(b)`, the `size`-based
transformation appears faster.
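For context, `MultiplicativeInverse` turns an integer division by a fixed divisor into a multiply-and-shift. A hedged sketch of the internal API (`Base.multiplicativeinverse` is an undocumented Base internal, so its surface may vary across Julia versions):

```julia
# Assumption: Base.multiplicativeinverse and div(::Int, ::MultiplicativeInverse)
# are available, as in base/multinverses.jl.
mi = Base.multiplicativeinverse(12)  # precompute the inverse of the divisor 12
d = div(1000, mi)                    # multiply + shift instead of hardware idiv
@assert d == div(1000, 12)
r = 1000 - d * mi.divisor            # remainder recovered from the quotient
@assert r == rem(1000, 12)
```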
To avoid an IO bottleneck, use `reshape(::CartesianIndices, x...)` for
the benchmark:
```julia
f(x) = let r = 0
    for i in eachindex(x)
        @inbounds r |= +(x[i].I...)
    end
    r
end
a = CartesianIndices((99,100,101));
@btime f(vec($a)); # 2.766 ms --> 2.591 ms
@btime f(reshape($a, 990, 1010)); # 3.412 ms --> 2.626 ms
@btime f(reshape($a, 33, 300, 101)); # 3.422 ms --> 2.342 ms
```
I haven't looked into the reason for this performance difference.
Besides the acceleration, this also makes it possible to reuse the
`MultiplicativeInverse` in some cases (like #42736).
So I think it might be useful?
---------
Co-authored-by: Andy Dienes <[email protected]>