@@ -149,14 +149,62 @@ array. The use of `@turbo` macro gives a significant performance boost.
149149- `dij_min`: The minimum value in the first `n` elements of the `dij` array.
150150- `best`: The index of the minimum value in the `dij` array.
151151"""
function fast_findmin end

# Implementation selection happens once, when this file is loaded:
#   * on aarch64 the scalar two-pass helper is always used;
#   * elsewhere, inputs of more than 8 elements go through the 8-lane SIMD
#     kernel, shorter ones through the scalar helper.
if Sys.ARCH == :aarch64
    fast_findmin(dij, n) = _naive_fast_findmin(@view(dij[begin:n]))
else
    function fast_findmin(dij, n)
        # `_simd_fast_findmin` only has a method for `DenseVector`; anything
        # else (views, wrappers, ...) takes the scalar path instead of raising
        # a `MethodError`. The `isa` test is on the argument's type and is
        # resolved at compile time.
        if n <= 8 || !(dij isa DenseVector)
            return _naive_fast_findmin(@view(dij[begin:n]))
        end
        return _simd_fast_findmin(dij, n)
    end
end
165+
"""
    _naive_fast_findmin(dij)

Scalar fallback for `fast_findmin`: returns `(minimum, index)` for the
collection `dij`.

Works in two passes: the minimum is reduced with `min` under `@fastmath`, then
the first position whose element compares `==` to that minimum is looked up.
The index is relative to `dij` itself (callers in this file pass a view of the
leading `n` elements).
"""
function _naive_fast_findmin(dij)
    # Pass 1: left fold with the fast-math `min`.
    smallest = @fastmath foldl(min, dij)
    # Pass 2: first index holding that value; the `::Int` assertion rejects
    # the `nothing` that `findfirst` returns when no element matches.
    position = findfirst(value -> value == smallest, dij)::Int
    return smallest, position
end
171+
"""
    _simd_fast_findmin(dij::DenseVector{T}, n) where {T}

Return `(min_value, min_index)` for the first `n` elements of `dij`, using
8-wide SIMD vectors.

Eight running minima (one per lane) and the index at which each was found are
kept while walking `dij` in batches of 8. A trailing partial batch is handled
by stepping back so that the final load covers `dij[n-7:n]`; re-visiting
elements is harmless because only strictly smaller values replace a lane's
current minimum. The lanes are then reduced to a single value, and among lanes
holding that value the smallest index is returned, so ties resolve to the first
occurrence, as in `_naive_fast_findmin`.

For `n < 8` a full vector cannot be loaded, so the scalar helper is used.
"""
function _simd_fast_findmin(dij::DenseVector{T}, n) where {T}
    # Fewer than 8 elements: the tail load below would start before index 1.
    n < 8 && return _naive_fast_findmin(@view(dij[begin:n]))

    lane = VecRange{8}(0)
    laneIndices = SIMD.Vec{8, Int}((1, 2, 3, 4, 5, 6, 7, 8))

    # Seed the running minima from the first batch rather than from `Inf`:
    # with an `Inf` seed and a strict `<`, an all-`Inf` input never updates
    # any lane and index 0 would be returned; `Inf` also cannot be converted
    # to a non-floating-point `T`.
    minvals = dij[lane + 1]
    min_indices = laneIndices

    n_batches, remainder = divrem(n, 8)
    i = 9
    laneIndices += 8
    @inbounds @fastmath for _ in 2:n_batches
        dijs = dij[lane + i]
        predicate = dijs < minvals
        minvals = vifelse(predicate, dijs, minvals)
        min_indices = vifelse(predicate, laneIndices, min_indices)

        i += 8
        laneIndices += 8
    end

    # Last (partial) batch: back up so the load ends exactly at element n.
    # Skipped when n is a multiple of 8, since every element is already seen.
    if remainder != 0
        back_track = 8 - remainder
        i -= back_track
        laneIndices -= back_track

        dijs = dij[lane + i]
        predicate = dijs < minvals
        minvals = vifelse(predicate, dijs, minvals)
        min_indices = vifelse(predicate, laneIndices, min_indices)
    end

    min_value = SIMD.minimum(minvals)

    # Several lanes may hold the minimum; pick the smallest index among them
    # (not the lowest-numbered lane) so ties resolve to the first occurrence.
    is_min = minvals == SIMD.Vec{8, T}(min_value)
    candidates = vifelse(is_min, min_indices, SIMD.Vec{8, Int}(typemax(Int)))
    min_index = SIMD.minimum(candidates)

    return min_value, min_index
end
0 commit comments