diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 00000000..08e98915
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,37 @@
+name: Benchmark
+on:
+  pull_request:
+    branches:
+      - main
+permissions:
+  pull-requests: write    # action needs to post a comment
+
+jobs:
+  benchmark:
+    name: Julia ${{ matrix.version }} - ${{ matrix.platform.os }} - ${{ matrix.platform.arch }} - ${{ github.event_name }}
+    runs-on: ${{ matrix.platform.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        platform:
+          - os: ubuntu-latest
+            arch: x64
+          - os: ubuntu-24.04-arm
+            arch: aarch64
+        version:
+          - '1'
+          - 'lts'
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: ${{ matrix.version }}
+          arch: ${{ matrix.platform.arch }}
+      - name: Install dependencies
+        run: julia -e 'using Pkg; pkg"add PkgBenchmark BenchmarkCI@0.1"'
+      - name: Run benchmarks
+        run: julia -e 'using BenchmarkCI; BenchmarkCI.judge(baseline = "origin/main")'
+      - name: Post results
+        run: julia -e 'using BenchmarkCI; BenchmarkCI.postjudge()'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
index d1458d2d..da2f1ce8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,7 +28,6 @@ Manifest.toml
 
 # Test data files and benchmarking
 jetsavetest*.dat
-benchmark/*
 
 # Misc files
 .DS_Store
diff --git a/Project.toml b/Project.toml
index 51e5e2a4..7e84c26b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -9,9 +9,9 @@ CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
 CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
 EnumX = "4e289a0a-7415-4d19-859d-a7e5c4648b56"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
-LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 LorentzVectorHEP = "f612022c-142a-473f-8cfd-a09cf3793c6c"
 MuladdMacro = "46d2c3a1-f734-5fdb-9937-b9b9aeba4221"
+SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
 StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
 
 [weakdeps]
@@ -31,10 +31,10 @@ EDM4hep = "0.4.0"
 EnumX = "1.0.4"
 JSON = "0.21"
 Logging = "1.9"
-LoopVectorization = "0.12.170"
 LorentzVectorHEP = "0.1.6"
 Makie = "0.20, 0.21, 0.22"
 MuladdMacro = "0.2.4"
+SIMD = "3.7.1"
 StructArrays = "0.6.18, 0.7"
 Test = "1.9"
 julia = "1.9"
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
new file mode 100644
index 00000000..e8a3ff0c
--- /dev/null
+++ b/benchmark/Project.toml
@@ -0,0 +1,6 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+JetReconstruction = "44e8cb2c-dfab-4825-9c70-d4808a591196"
+
+[sources]
+JetReconstruction = {path = ".."}
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 00000000..d02d73b2
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,7 @@
+# Mini-benchmarks for JetReconstruction
+
+The benchmarks can be follow the standard [`PkgBenchmark.jl`](https://juliaci.github.io/PkgBenchmark.jl/stable/) structure. Can be run locally with you favourite `PkgBenchmark.jl`-compatible tool.
+
+```sh
+julia --project=benchmark benchmark/benchmarks.jl
+```
\ No newline at end of file
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
new file mode 100644
index 00000000..020b2bae
--- /dev/null
+++ b/benchmark/benchmarks.jl
@@ -0,0 +1,65 @@
+using BenchmarkTools
+using JetReconstruction
+
+const events_file_pp = joinpath(@__DIR__, "..", "test", "data", "events.pp13TeV.hepmc3.gz")
+const events_file_ee = joinpath(@__DIR__, "..", "test", "data", "events.eeH.hepmc3.gz")
+
+const pp_events = JetReconstruction.read_final_state_particles(events_file_pp,
+                                                               T = PseudoJet)
+const ee_events = JetReconstruction.read_final_state_particles(events_file_ee, T = EEJet)
+
+function jet_reconstruct_harness(events; algorithm, strategy, power, distance,
+                                 recombine = RecombinationMethods[RecombinationScheme.EScheme],
+                                 ptmin::Real = 5.0, dcut = nothing, njets = nothing,)
+    for event in events
+        cs = jet_reconstruct(event; R = distance, p = power, algorithm = algorithm,
+                             strategy = strategy, recombine...)
+        if !isnothing(njets)
+            finaljets = exclusive_jets(cs; njets = njets)
+        elseif !isnothing(dcut)
+            finaljets = exclusive_jets(cs; dcut = dcut)
+        else
+            finaljets = inclusive_jets(cs; ptmin = ptmin)
+        end
+    end
+end
+
+const SUITE = BenchmarkGroup()
+SUITE["jet_reconstruction"] = BenchmarkGroup(["reconstruction"])
+
+## pp events
+for stg in [RecoStrategy.N2Plain, RecoStrategy.N2Tiled]
+    strategy_name = "$(stg)"
+    SUITE["jet_reconstruction"][strategy_name] = BenchmarkGroup(["pp", strategy_name])
+    for alg in [JetAlgorithm.AntiKt, JetAlgorithm.CA, JetAlgorithm.Kt]
+        for distance in [0.4]
+            power = JetReconstruction.algorithm2power[alg]
+            SUITE["jet_reconstruction"][strategy_name]["Alg=$alg, R=$distance"] = @benchmarkable jet_reconstruct_harness($pp_events;
+                                                                                                                         algorithm = $alg,
+                                                                                                                         strategy = $stg,
+                                                                                                                         power = $power,
+                                                                                                                         distance = $distance,
+                                                                                                                         ptmin = 5.0) evals=1 samples=32
+        end
+    end
+end
+
+## ee events
+SUITE["jet_reconstruction"]["ee"] = BenchmarkGroup(["ee"])
+for alg in [JetAlgorithm.Durham]
+    for distance in [0.4]
+        power = -1
+        SUITE["jet_reconstruction"]["ee"]["Alg=$alg, R=$distance"] = @benchmarkable jet_reconstruct_harness($ee_events;
+                                                                                                            algorithm = $alg,
+                                                                                                            strategy = $RecoStrategy.Best,
+                                                                                                            power = $power,
+                                                                                                            distance = $distance,
+                                                                                                            ptmin = 5.0) evals=1 samples=32
+    end
+end
+
+if abspath(PROGRAM_FILE) == @__FILE__
+    @info "Running benchmark suite"
+    results = run(SUITE, verbose = true)
+    @info results
+end
diff --git a/src/JetReconstruction.jl b/src/JetReconstruction.jl
index 05ce60ea..d8dedbb7 100644
--- a/src/JetReconstruction.jl
+++ b/src/JetReconstruction.jl
@@ -18,6 +18,7 @@ module JetReconstruction
 using LorentzVectorHEP
 using MuladdMacro
 using StructArrays
+using SIMD
 
 # Import from LorentzVectorHEP methods for those 4-vector types
 pt2(p::LorentzVector) = LorentzVectorHEP.pt2(p)
diff --git a/src/PlainAlgo.jl b/src/PlainAlgo.jl
index 2e2db535..cf1611c1 100644
--- a/src/PlainAlgo.jl
+++ b/src/PlainAlgo.jl
@@ -1,4 +1,3 @@
-using LoopVectorization
 
 """
     dist(i, j, rapidity_array, phi_array)
diff --git a/src/TiledAlgoLL.jl b/src/TiledAlgoLL.jl
index 25a16481..34b6e867 100644
--- a/src/TiledAlgoLL.jl
+++ b/src/TiledAlgoLL.jl
@@ -5,7 +5,6 @@
 
 using Logging
 using Accessors
-using LoopVectorization
 
 # Include struct definitions and basic operations
 include("TiledAlgoLLStructs.jl")
diff --git a/src/Utils.jl b/src/Utils.jl
index aec6d82b..bb39bbb6 100644
--- a/src/Utils.jl
+++ b/src/Utils.jl
@@ -149,14 +149,62 @@ array. The use of `@turbo` macro gives a significant performance boost.
 - `dij_min`: The minimum value in the first `n` elements of the `dij` array.
 - `best`: The index of the minimum value in the `dij` array.
 """
-fast_findmin(dij, n) = begin
-    # findmin(@inbounds @view dij[1:n])
-    best = 1
-    @inbounds dij_min = dij[1]
-    @turbo for here in 2:n
-        newmin = dij[here] < dij_min
-        best = newmin ? here : best
-        dij_min = newmin ? dij[here] : dij_min
+function fast_findmin end
+
+if Sys.ARCH == :aarch64
+    fast_findmin(dij, n) = _naive_fast_findmin(@view(dij[begin:n]))
+else
+    function fast_findmin(dij, n)
+        if n <= 8
+            return _naive_fast_findmin(@view(dij[begin:n]))
+        else
+            return _simd_fast_findmin(dij, n)
+        end
     end
-    dij_min, best
+end
+
+function _naive_fast_findmin(dij)
+    x = @fastmath foldl(min, dij)
+    i = findfirst(==(x), dij)::Int
+    x, i
+end
+
+function _simd_fast_findmin(dij::DenseVector{T}, n) where {T}
+    laneIndices = SIMD.Vec{8, Int}((1, 2, 3, 4, 5, 6, 7, 8))
+    minvals = SIMD.Vec{8, T}(Inf)
+    min_indices = SIMD.Vec{8, Int}(0)
+
+    n_batches, remainder = divrem(n, 8)
+    lane = VecRange{8}(0)
+    i = 1
+    @inbounds @fastmath for _ in 1:n_batches
+        dijs = dij[lane + i]
+        predicate = dijs < minvals
+        minvals = vifelse(predicate, dijs, minvals)
+        min_indices = vifelse(predicate, laneIndices, min_indices)
+
+        i += 8
+        laneIndices += 8
+    end
+
+    # last batch
+    back_track = 8 - remainder
+    i -= back_track
+    laneIndices -= back_track
+
+    dijs = dij[lane + i]
+    predicate = dijs < minvals
+    minvals = vifelse(predicate, dijs, minvals)
+    min_indices = vifelse(predicate, laneIndices, min_indices)
+
+    min_value = SIMD.minimum(minvals)
+    min_index = @inbounds min_value == minvals[1] ? min_indices[1] :
+                          min_value == minvals[2] ? min_indices[2] :
+                          min_value == minvals[3] ? min_indices[3] :
+                          min_value == minvals[4] ? min_indices[4] :
+                          min_value == minvals[5] ? min_indices[5] :
+                          min_value == minvals[6] ? min_indices[6] :
+                          min_value == minvals[7] ? min_indices[7] : min_indices[8]
+
+    return min_value, min_index
 end