diff --git a/Project.toml b/Project.toml index 262d1dd2..4bb57f3b 100644 --- a/Project.toml +++ b/Project.toml @@ -9,9 +9,9 @@ CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2" EnumX = "4e289a0a-7415-4d19-859d-a7e5c4648b56" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" -LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" LorentzVectorHEP = "f612022c-142a-473f-8cfd-a09cf3793c6c" MuladdMacro = "46d2c3a1-f734-5fdb-9937-b9b9aeba4221" +SIMD = "fdea26ae-647d-5447-a871-4b548cad5224" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" @@ -33,10 +33,10 @@ EDM4hep = "0.4.0" EnumX = "1.0.4" JSON = "0.21" Logging = "1.9" -LoopVectorization = "0.12.170" LorentzVectorHEP = "0.1.6" Makie = "0.20, 0.21, 0.22, 0.23, 0.24" MuladdMacro = "0.2.4" +SIMD = "3.7.1" Random = "1.9" Statistics = "1.9" StructArrays = "0.6.18, 0.7" diff --git a/src/JetReconstruction.jl b/src/JetReconstruction.jl index dd171900..4e987b5c 100644 --- a/src/JetReconstruction.jl +++ b/src/JetReconstruction.jl @@ -18,6 +18,7 @@ module JetReconstruction using LorentzVectorHEP using MuladdMacro using StructArrays +using SIMD # Import from LorentzVectorHEP methods for those 4-vector types pt2(p::LorentzVector) = LorentzVectorHEP.pt2(p) diff --git a/src/PlainAlgo.jl b/src/PlainAlgo.jl index ab5ec256..beabc4fa 100644 --- a/src/PlainAlgo.jl +++ b/src/PlainAlgo.jl @@ -1,5 +1,3 @@ -using LoopVectorization - """ dist(i, j, rapidity_array, phi_array) diff --git a/src/TiledAlgoLL.jl b/src/TiledAlgoLL.jl index cbe82367..8d9bfbcf 100644 --- a/src/TiledAlgoLL.jl +++ b/src/TiledAlgoLL.jl @@ -5,7 +5,6 @@ using Logging using Accessors -using LoopVectorization # Include struct definitions and basic operations include("TiledAlgoLLStructs.jl") diff --git a/src/Utils.jl b/src/Utils.jl index caa9fe2e..abbf2c05 100644 --- a/src/Utils.jl +++ b/src/Utils.jl @@ -114,14 +114,62 @@ array. The use of `@turbo` macro gives a significant performance boost. - `dij_min`: The minimum value in the first `n` elements of the `dij` array. - `best`: The index of the minimum value in the `dij` array. """ -fast_findmin(dij, n) = begin - # findmin(@inbounds @view dij[1:n]) - best = 1 - @inbounds dij_min = dij[1] - @turbo for here in 2:n - newmin = dij[here] < dij_min - best = newmin ? here : best - dij_min = newmin ? dij[here] : dij_min +function fast_findmin end + +if Sys.ARCH == :aarch64 + fast_findmin(dij, n) = _naive_fast_findmin(@view(dij[begin:n])) +else + function fast_findmin(dij, n) + if n <= 8 + return _naive_fast_findmin(@view(dij[begin:n])) + else + return _simd_fast_findmin(dij, n) + end end - dij_min, best +end + +function _naive_fast_findmin(dij) + x = @fastmath foldl(min, dij) + i = findfirst(==(x), dij)::Int + x, i +end + +function _simd_fast_findmin(dij::DenseVector{T}, n) where {T} + laneIndices = SIMD.Vec{8, Int}((1, 2, 3, 4, 5, 6, 7, 8)) + minvals = SIMD.Vec{8, T}(Inf) + min_indices = SIMD.Vec{8, Int}(0) + + n_batches, remainder = divrem(n, 8) + lane = VecRange{8}(0) + i = 1 + @inbounds @fastmath for _ in 1:n_batches + dijs = dij[lane + i] + predicate = dijs < minvals + minvals = vifelse(predicate, dijs, minvals) + min_indices = vifelse(predicate, laneIndices, min_indices) + + i += 8 + laneIndices += 8 + end + + # last batch + back_track = 8 - remainder + i -= back_track + laneIndices -= back_track + + dijs = dij[lane + i] + predicate = dijs < minvals + minvals = vifelse(predicate, dijs, minvals) + min_indices = vifelse(predicate, laneIndices, min_indices) + + min_value = SIMD.minimum(minvals) + min_index = @inbounds min_value == minvals[1] ? min_indices[1] : + min_value == minvals[2] ? min_indices[2] : + min_value == minvals[3] ? min_indices[3] : + min_value == minvals[4] ? min_indices[4] : + min_value == minvals[5] ? min_indices[5] : + min_value == minvals[6] ? min_indices[6] : + min_value == minvals[7] ? min_indices[7] : min_indices[8] + + return min_value, min_index end