diff --git a/.gitignore b/.gitignore
index 11531c35..0f68bce6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -60,3 +60,4 @@ docs/src/.$kite_power_tools.drawio.bkp
 data/linear_model.arrow
 data/nonlinear_model.arrow
 coverage/lcov.info
+temp_benchmark.jl
diff --git a/src/mtk_model.jl b/src/mtk_model.jl
index 6b34c5a9..9d14a5d2 100644
--- a/src/mtk_model.jl
+++ b/src/mtk_model.jl
@@ -876,7 +876,7 @@ function linear_vsm_eqs!(s, eqs, guesses; aero_force_b, aero_moment_b, group_aer
     return eqs, guesses
 end
 
-function create_sys!(s::SymbolicAWEModel, system::SystemStructure; init_va_b)
+function create_sys!(s::SymbolicAWEModel, system::SystemStructure; init_va_b, bench=false)
     eqs = []
     defaults = Pair{Num, Any}[]
     guesses = Pair{Num, Any}[]
@@ -916,31 +916,15 @@ function create_sys!(s::SymbolicAWEModel, system::SystemStructure; init_va_b)
     eqs, defaults = wing_eqs!(s, eqs, defaults; tether_wing_force, tether_wing_moment, aero_force_b, aero_moment_b, 
         ω_b, α_b, R_b_w, wing_pos, wing_vel, wing_acc, stabilize, fix_nonstiff)
     eqs = scalar_eqs!(s, eqs; R_b_w, wind_vec_gnd, va_wing_b, wing_pos, wing_vel, wing_acc, twist_angle, twist_ω, ω_b, α_b)
-    
-    # te_I = (1/3 * (s.set.mass/8) * te_length^2)
-    # # -damping / I * ω = α_damping
-    # # solve for c: (c * (k*m/s^2) / (k*m^2)) * (m/s)=m/s^2 in wolframalpha
-    # # damping should be in N*m*s
-    # rot_damping = 0.1s.damping * te_length
-
-    # eqs = [
-    #     eqs
-    #     trailing_edge_α[1] ~ (force[:, s.i_A]) ⋅ e_te_A * te_length / te_I - (rot_damping[1] / te_I) * trailing_edge_ω[1] # TODO: add trailing edge
-    #     trailing_edge_α[2] ~ (force[:, s.i_B]) ⋅ e_te_B * te_length / te_I - (rot_damping[2] / te_I) * trailing_edge_ω[2]
-    # ]
-    
+        
     eqs = Symbolics.scalarize.(reduce(vcat, Symbolics.scalarize.(eqs)))
 
-    # discrete_events = [
-    #     true => [
-    #         [Q_b_w[i] ~ normalize(Q_b_w)[i] for i in 1:4]
-    #         [twist_angle[i] ~ clamp(twist_angle[i], -π/2, π/2) for i in eachindex(s.point_groups)]
-    #         ]
-    #     ]
-
-    @info "Creating ODESystem"
-    # @named sys = ODESystem(eqs, t; discrete_events)
-    @time @named sys = ODESystem(eqs, t)
+    ! bench && @info "Creating ODESystem"
+    if bench
+        @named sys = ODESystem(eqs, t)
+    else
+        @time @named sys = ODESystem(eqs, t)
+    end
 
     defaults = [
         defaults
diff --git a/src/symbolic_awe_model.jl b/src/symbolic_awe_model.jl
index 38521001..eb1ffead 100644
--- a/src/symbolic_awe_model.jl
+++ b/src/symbolic_awe_model.jl
@@ -301,7 +301,7 @@ and only update the state variables. Otherwise, it will create a new model from
 """
 function init!(s::SymbolicAWEModel; 
     solver=nothing, stiffness_factor = nothing, delta = nothing, adaptive=true, prn=true, 
-    precompile=false, remake=false, reload=false, 
+    precompile=false, remake=false, reload=false, bench=false,
     lin_outputs=Num[]
 )
     if isnothing(solver)
@@ -322,11 +322,19 @@ function init!(s::SymbolicAWEModel;
         init_Q_b_w, R_b_w, init_va_b = initial_orient(s)
         init!(s.sys_struct, s.set)
         
-        inputs = create_sys!(s, s.sys_struct; init_va_b)
+        inputs = create_sys!(s, s.sys_struct; init_va_b, bench)
         prn && @info "Simplifying the system"
         @suppress_err begin
-            prn ? (@time (sys, _) = structural_simplify(s.full_sys, (inputs, []))) :
-                ((sys, _) = structural_simplify(s.full_sys, (inputs, [])))
+            if prn && !bench
+                @time (sys, _) = structural_simplify(s.full_sys, (inputs, []))
+            elseif bench
+                local elapsed, sys
+                elapsed = @elapsed (sys, _) = structural_simplify(s.full_sys, (inputs, []))
+                s.sys = sys
+                return elapsed
+            else
+                (sys, _) = structural_simplify(s.full_sys, (inputs, []))
+            end
             s.sys = sys
         end
         dt = SimFloat(1/s.set.sample_freq)
@@ -353,13 +361,19 @@ function init!(s::SymbolicAWEModel;
     end
     model_path = joinpath(KiteUtils.get_data_path(), get_model_name(s.set; precompile))
     if !ispath(model_path) || remake
-        init(s)
+        res = init(s)
+        if bench
+            return res
+        end
     end
     _, success = reinit!(s, solver; adaptive, precompile, reload, lin_outputs, prn)
     if !success
         rm(model_path)
         @info "Rebuilding the system. This can take some minutes..."
-        init(s)
+        res = init(s)
+        if bench
+            return res
+        end
         reinit!(s, solver; adaptive, precompile, lin_outputs, prn, reload=true)
     end
     return s.integrator
diff --git a/test/bench_ref.jl b/test/bench_ref.jl
new file mode 100644
index 00000000..67e85f63
--- /dev/null
+++ b/test/bench_ref.jl
@@ -0,0 +1,75 @@
+# SPDX-FileCopyrightText: 2022 Uwe Fechner
+# SPDX-License-Identifier: MIT
+
+const reference = 4.826620521958565e7 # AMD Ryzen 7840U, Julia 1.11, 1 thread
+
+"""
+    cpu_benchmark_scalar(target_time=1.0)
+
+Performs scalar CPU-intensive operations without SIMD for approximately `target_time` seconds.
+This function performs basic arithmetic operations on scalar values to measure CPU performance
+without utilizing vector/SIMD instructions.
+
+Returns the number of operations performed and the actual elapsed time.
+"""
+function cpu_benchmark_scalar(target_time=1.0)
+    # First, do a short calibration run to estimate operations per second
+    calibrate_ops = 1_000_000
+    start_cal = Base.time()
+    
+    # Scalar operations that avoid SIMD optimization
+    result = 0.0
+    x = 1.23456789
+    y = 9.87654321
+    
+    for i in 1:calibrate_ops
+        # Mix of operations to avoid compiler optimizations
+        x = x * 1.000001 + sin(y * 0.001)
+        y = y * 0.999999 + cos(x * 0.001)
+        result += sqrt(abs(x + y))
+    end
+    
+    cal_time = Base.time() - start_cal
+    ops_per_second = calibrate_ops / cal_time
+    
+    # Estimate total operations needed for target time
+    target_ops = Int(round(ops_per_second * target_time))
+    
+    # Main benchmark run
+    start_time = Base.time()
+    result = 0.0
+    x = 1.23456789
+    y = 9.87654321
+    
+    @inbounds for i in 1:target_ops
+        # Scalar arithmetic operations that are hard to vectorize
+        x = x * 1.000001 + sin(y * 0.001)
+        y = y * 0.999999 + cos(x * 0.001)
+        result += sqrt(abs(x + y))
+        
+        # Additional scalar operations to increase workload
+        if i % 1000 == 0
+            x = x / (1.0 + 1e-10)  # Prevent overflow
+            y = y / (1.0 + 1e-10)
+        end
+    end
+    
+    elapsed_time = Base.time() - start_time
+    
+    # Force the compiler to use the result (prevent dead code elimination)
+    println("Benchmark result (ignore): $(result)")
+
+    return target_ops / elapsed_time
+end
+
+"""
+    rel_cpu_performance()
+
+A simple CPU benchmark that performs scalar operations for approximately 1 second.
+This is a standalone function that doesn't depend on any external packages.
+"""
+function rel_cpu_performance()
+    ops = cpu_benchmark_scalar(1.0)
+    println("CPU Performance: $(round(ops/1e6, digits=1)) million operations per second")
+    return ops/reference
+end
\ No newline at end of file
diff --git a/test/bench_simplify.jl b/test/bench_simplify.jl
new file mode 100644
index 00000000..9615af92
--- /dev/null
+++ b/test/bench_simplify.jl
@@ -0,0 +1,120 @@
+# Copyright (c) 2024, 2025 Bart van de Lint, Uwe Fechner
+# SPDX-License-Identifier: MIT
+
+SIMPLE = false
+T_REF = 48.0 # AMD Ryzen 7840U, Julia 1.11, no sys image [s]
+             # 37s with sys image
+
+using Pkg
+if ! ("Test" ∈ keys(Pkg.project().dependencies))
+    using TestEnv; TestEnv.activate()
+end
+using KiteModels, LinearAlgebra, Statistics, Test, Distributed
+include("bench_ref.jl")
+
+# Simulation parameters
+dt = 0.05
+total_time = 10.0  # Longer simulation to see oscillations
+vsm_interval = 3
+steps = Int(round(total_time / dt))
+
+# Steering parameters
+steering_freq = 1/2  # Hz - full left-right cycle frequency
+steering_magnitude = 10.0      # Magnitude of steering input [Nm]
+
+# Function to run benchmark in separate Julia process
+function run_benchmark_subprocess()
+    # Create a temporary script file for the benchmark
+    benchmark_script = """
+    using Pkg
+    if ! ("Test" ∈ keys(Pkg.project().dependencies))
+        using TestEnv; TestEnv.activate()
+    end
+    using KiteModels, LinearAlgebra, Statistics
+    include("test/bench_ref.jl")
+    
+    SIMPLE = $SIMPLE
+    T_REF = $T_REF
+    
+    # Initialize model
+    set = load_settings("system_ram.yaml")
+    set.segments = 3
+    set_values = [-50, 0.0, 0.0]  # Set values of the torques of the three winches. [Nm]
+    set.quasi_static = false
+    set.physical_model = SIMPLE ? "simple_ram" : "ram"
+    
+    sam = SymbolicAWEModel(set)
+    sam.set.abs_tol = 1e-2
+    sam.set.rel_tol = 1e-2
+    rm("data/model_1.11_ram_dynamic_3_seg.bin"; force=true)
+    
+    # Initialize at elevation
+    set.l_tethers[2] += 0.2
+    set.l_tethers[3] += 0.2
+    time_ = init!(sam; remake=false, reload=true, bench=true)
+    @info "Simplify took \$time_ seconds"
+    rel_performance = (T_REF / rel_cpu_performance())/time_
+    
+    # Write results to file for parent process to read
+    open("benchmark_results.tmp", "w") do f
+        println(f, time_)
+        println(f, rel_performance)
+    end
+    """
+    
+    # Write the script to a temporary file
+    temp_script = "temp_benchmark.jl"
+    open(temp_script, "w") do f
+        write(f, benchmark_script)
+    end
+    
+    try
+        # Run the benchmark in a separate Julia process
+        result = run(`julia --project=. $temp_script`)
+        
+        if result.exitcode == 0
+            # Read results from temporary file
+            if isfile("benchmark_results.tmp")
+                lines = readlines("benchmark_results.tmp")
+                time_ = parse(Float64, lines[1])
+                rel_performance = parse(Float64, lines[2])
+                
+                @info "Simplify took $time_ seconds"
+                @info "Relative performance: $rel_performance"
+                @test rel_performance > 0.8
+                
+                # Clean up temporary files
+                rm("benchmark_results.tmp", force=true)
+                rm(temp_script, force=true)
+                
+                return time_, rel_performance
+            else
+                error("Benchmark results file not found")
+            end
+        else
+            error("Benchmark process failed with exit code $(result.exitcode)")
+        end
+    catch e
+        # Clean up temporary files in case of error
+        rm("benchmark_results.tmp", force=true)
+        rm(temp_script, force=true)
+        rethrow(e)
+    end
+end
+
+# Run the benchmark in a separate process
+time_, rel_performance = run_benchmark_subprocess()
+
+# Note: sys object is not available when running in separate process
+# If you need sys, you would need to serialize it or run parts in the main process
+nothing
+
+# Desktop, AMD Ryzen 9 7950X, Julia 1.11:
+# - first  run 34.5 seconds
+# - second run 21.1 seconds
+
+# Laptop, AMD Ryzen 7 7840U, Julia 1.11:
+# - first  run 35.0 seconds
+# - second run 24.0 seconds
+
+