diff --git a/.buildkite/runtests.yml b/.buildkite/runtests.yml
index bc95451..e6e39e3 100644
--- a/.buildkite/runtests.yml
+++ b/.buildkite/runtests.yml
@@ -4,6 +4,7 @@ steps:
       setup:
         version:
           - "1"
+          - "1.10"
     env:
       GROUP: CUDA
     plugins:
@@ -30,7 +31,7 @@ steps:
     matrix:
       setup:
         version:
-          - "1"
+          - "1.10"
     env:
       GROUP: AMDGPU
     plugins:
diff --git a/Project.toml b/Project.toml
index df16254..a2842c5 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,22 +5,30 @@ version = "1.0.0-DEV"

 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
-NonlinearSolve = "8913a72c-1f9b-4ce2-8d82-65094dcecaec"
 Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
 QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
-SimpleChains = "de6bee2f-e2f4-4ec7-b6ed-219cc6f6e9e5"
 SimpleNonlinearSolve = "727e6d20-b764-4bd8-a329-72de5adea6c7"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

 [compat]
-julia = "1.6"
\ No newline at end of file
+Adapt = "4.3"
+DiffEqGPU = "3.4"
+Enzyme = "<0.13.35"
+ForwardDiff = "0.10"
+KernelAbstractions = "<0.9.30"
+Optimization = "4.1"
+QuasiMonteCarlo = "0.3"
+Reexport = "1.2"
+SciMLBase = "2.79"
+Setfield = "1.1"
+SimpleNonlinearSolve = "2.2"
+StaticArrays = "1.9"
+julia = "1.10"
diff --git a/benchmarks/CPU_vs_GPU/benchmark.jl b/benchmarks/CPU_vs_GPU/benchmark.jl
index 533ad91..3421e90 100644
--- a/benchmarks/CPU_vs_GPU/benchmark.jl
+++ b/benchmarks/CPU_vs_GPU/benchmark.jl
@@ -9,7 +9,11 @@ device!(2)
 N = 10

 function rosenbrock(x, p)
-    sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1))
+    res = zero(eltype(x))
+    for i in 1:(length(x) - 1)
+        res += p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2
+    end
+    res
 end
 x0 = @SArray zeros(Float32, N)
 p = @SArray Float32[1.0, 100.0]
diff --git a/benchmarks/CPU_vs_GPU/wp_algs.jl b/benchmarks/CPU_vs_GPU/wp_algs.jl
index 3df3a1e..ade5392 100644
--- a/benchmarks/CPU_vs_GPU/wp_algs.jl
+++ b/benchmarks/CPU_vs_GPU/wp_algs.jl
@@ -9,7 +9,11 @@ device!(2)
 N = 10

 function rosenbrock(x, p)
-    sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1))
+    res = zero(eltype(x))
+    for i in 1:(length(x) - 1)
+        res += p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2
+    end
+    res
 end

 # x0 = @SArray zeros(Float32, N)
diff --git a/src/PSOGPU.jl b/src/PSOGPU.jl
index c1ee642..4a30f28 100644
--- a/src/PSOGPU.jl
+++ b/src/PSOGPU.jl
@@ -4,7 +4,7 @@ using SciMLBase, StaticArrays, Setfield, KernelAbstractions
 using QuasiMonteCarlo, Optimization, SimpleNonlinearSolve, ForwardDiff
 import Adapt
 import Adapt: adapt
-import Enzyme: autodiff_deferred, Active, Reverse
+import Enzyme: autodiff_deferred, Active, Reverse, Const
 import KernelAbstractions: @atomic, @atomicreplace, @atomicswap
 using QuasiMonteCarlo
 import DiffEqGPU: GPUTsit5, make_prob_compatible, vectorized_solve, vectorized_asolve
diff --git a/src/hybrid.jl b/src/hybrid.jl
index af0502d..5f6d67d 100644
--- a/src/hybrid.jl
+++ b/src/hybrid.jl
@@ -2,7 +2,7 @@
     i = @index(Global, Linear)
     nlcache = remake(nlprob; u0 = x0s[i])
    sol = solve(nlcache, opt; maxiters, abstol, reltol)
-    result[i] = sol.u
+    @inbounds result[i] = sol.u
 end

 function SciMLBase.solve!(
@@ -19,13 +19,14 @@ function SciMLBase.solve!(
     backend = opt.backend

     prob = remake(cache.prob, lb = nothing, ub = nothing)
-    f = Base.Fix2(prob.f.f, prob.p)
-    ∇f = instantiate_gradient(f, prob.f.adtype)
-    kernel = simplebfgs_run!(backend)

     result = cache.start_points
     copyto!(result, x0s)
-    nlprob = NonlinearProblem{false}(∇f, prob.u0)
+
+    ∇f = instantiate_gradient(prob.f.f, prob.f.adtype)
+
+    kernel = simplebfgs_run!(backend)
+    nlprob = SimpleNonlinearSolve.ImmutableNonlinearProblem{false}(∇f, prob.u0, prob.p)

     nlalg = LocalOpt isa LBFGS ?
             SimpleLimitedMemoryBroyden(;
diff --git a/src/utils.jl b/src/utils.jl
index 59565f0..e47bb7a 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -1,3 +1,8 @@
+import SciMLBase: @add_kwonly, AbstractNonlinearProblem, AbstractNonlinearFunction,
+                  AbstractODEFunction, AbstractODEProblem, warn_paramtype, ConstructionBase,
+                  NullParameters, StandardNonlinearProblem, @reset, updated_u0_p,
+                  remake_initialization_data, maybe_eager_initialize_problem
+
 @inbounds function uniform_itr(
         dim::Int, lb::AbstractArray{T}, ub::AbstractArray{T}) where {T}
     (rand(T) * (ub[i] - lb[i]) + lb[i] for i in 1:dim)
@@ -342,10 +347,12 @@ Based on the paper: Particle swarm optimization method for constrained optimizat
     penalty
 end

+#TODO: Possible migration to DifferentiationInterface.jl;
+# however, I cannot compile GPU-compatible gradients with Enzyme as of Mar 2025
 @inline function instantiate_gradient(f, adtype::AutoForwardDiff)
-    (θ, p) -> ForwardDiff.gradient(f, θ)
+    (θ, p) -> ForwardDiff.gradient(x -> f(x, p), θ)
 end

 @inline function instantiate_gradient(f, adtype::AutoEnzyme)
-    (θ, p) -> autodiff_deferred(Reverse, f, Active, Active(θ))[1][1]
+    (θ, p) -> autodiff_deferred(Reverse, Const(x -> f(x, p)), Active, Active(θ))[1][1]
 end
diff --git a/test/Project.toml b/test/Project.toml
index 93b17e3..acaf70a 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,5 +1,4 @@
 [deps]
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
diff --git a/test/constraints.jl b/test/constraints.jl
index ab31e52..35cf6e0 100644
--- a/test/constraints.jl
+++ b/test/constraints.jl
@@ -1,14 +1,6 @@
 using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random

-DEVICE = get(ENV, "GROUP", "CUDA")
-
-@eval using $(Symbol(DEVICE))
-
-if DEVICE == "CUDA"
-    backend = CUDABackend()
-elseif DEVICE == "AMDGPU"
-    backend = ROCBackend()
-end
+include("./utils.jl")

 Random.seed!(1234)
diff --git a/test/gpu.jl b/test/gpu.jl
index 4292b63..c072517 100644
--- a/test/gpu.jl
+++ b/test/gpu.jl
@@ -1,14 +1,6 @@
 using PSOGPU, StaticArrays, SciMLBase, Test, LinearAlgebra, Random

-DEVICE = get(ENV, "GROUP", "CUDA")
-
-@eval using $(Symbol(DEVICE))
-
-if DEVICE == "CUDA"
-    backend = CUDABackend()
-elseif DEVICE == "AMDGPU"
-    backend = ROCBackend()
-end
+include("./utils.jl")

 @testset "Rosenbrock GPU tests $(N)" for N in 2:4
     Random.seed!(1234)
@@ -19,7 +11,11 @@
     lb = @SArray fill(Float32(-1.0), N)
     ub = @SArray fill(Float32(10.0), N)

     function rosenbrock(x, p)
-        sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1))
+        res = zero(eltype(x))
+        for i in 1:(length(x) - 1)
+            res += p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2
+        end
+        res
     end
     x0 = @SArray zeros(Float32, N)
diff --git a/test/lbfgs.jl b/test/lbfgs.jl
index b15e7aa..59817d6 100644
--- a/test/lbfgs.jl
+++ b/test/lbfgs.jl
@@ -1,14 +1,6 @@
 using PSOGPU, Optimization, StaticArrays

-DEVICE = get(ENV, "GROUP", "CUDA")
-
-@eval using $(Symbol(DEVICE))
-
-if DEVICE == "CUDA"
-    backend = CUDABackend()
-elseif DEVICE == "AMDGPU"
-    backend = ROCBackend()
-end
+include("./utils.jl")

 function objf(x, p)
     return 1 - x[1]^2 - x[2]^2
@@ -25,7 +17,11 @@ sol = Optimization.solve(prob,
 N = 10

 function rosenbrock(x, p)
-    sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1))
+    res = zero(eltype(x))
+    for i in 1:(length(x) - 1)
+        res += p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2
+    end
+    res
 end
 x0 = @SArray rand(Float32, N)
 p = @SArray Float32[1.0, 100.0]
diff --git a/test/regression.jl b/test/regression.jl
index 27f767d..ccf1963 100644
--- a/test/regression.jl
+++ b/test/regression.jl
@@ -9,7 +9,11 @@ using QuasiMonteCarlo
 ub = @SArray fill(Float32(10.0), N)

 function rosenbrock(x, p)
-    sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1))
+    res = zero(eltype(x))
+    for i in 1:(length(x) - 1)
+        res += p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2
+    end
+    res
 end

 x0 = @SArray zeros(Float32, N)
@@ -157,7 +161,11 @@ end
 ub = @SArray fill(Float32(10.0), N)

 function rosenbrock(x, p)
-    sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1))
+    res = zero(eltype(x))
+    for i in 1:(length(x) - 1)
+        res += p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2
+    end
+    res
 end

 x0 = @SArray zeros(Float32, N)
diff --git a/test/reinit.jl b/test/reinit.jl
index 1e0e27b..6c60e0c 100644
--- a/test/reinit.jl
+++ b/test/reinit.jl
@@ -7,9 +7,12 @@
 lb = @SArray fill(Float32(-1.0), 3)
 ub = @SArray fill(Float32(10.0), 3)

 function rosenbrock(x, p)
-    sum(p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2 for i in 1:(length(x) - 1))
+    res = zero(eltype(x))
+    for i in 1:(length(x) - 1)
+        res += p[2] * (x[i + 1] - x[i]^2)^2 + (p[1] - x[i])^2
+    end
+    res
 end
-
 x0 = @SArray zeros(Float32, 3)
 p = @SArray Float32[1.0, 100.0]
@@ -25,10 +28,10 @@ cache = init(prob, ParallelSyncPSOKernel(n_particles; backend = CPU()))

 reinit!(cache)

-cache = init(prob, PSOGPU.HybridPSO(; local_opt = PSOGPU.BFGS(), backend = backend))
+cache = init(prob, PSOGPU.HybridPSO(; local_opt = PSOGPU.BFGS(), backend = CPU()))

 reinit!(cache)

-cache = init(prob, PSOGPU.HybridPSO(; local_opt = PSOGPU.LBFGS(), backend = backend))
+cache = init(prob, PSOGPU.HybridPSO(; local_opt = PSOGPU.LBFGS(), backend = CPU()))

 reinit!(cache)
diff --git a/test/runtests.jl b/test/runtests.jl
index 3925374..e6454c6 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,12 +1,15 @@
 using SafeTestsets
 using Test

-const GROUP = get(ENV, "GROUP", "CPU")
+global CI_GROUP = get(ENV, "GROUP", "CPU")

 @safetestset "Regression tests" include("./regression.jl")
+@safetestset "Reinitialization tests" include("./reinit.jl")

-if GROUP != "CPU"
-    @safetestset "GPU optimizers tests" include("./gpu.jl")
-    @safetestset "GPU optimizers with constraints tests" include("./constraints.jl")
-    @safetestset "GPU hybrid optimizers" include("./lbfgs.jl")
+#TODO: Currently throws a warning for redefinition with the use of @testset multiple times. Migrate to TestItemRunners.jl
+@testset for BACKEND in unique(("CPU", CI_GROUP))
+    global GROUP = BACKEND
+    @testset "$(BACKEND) optimizers tests" include("./gpu.jl")
+    @testset "$(BACKEND) optimizers with constraints tests" include("./constraints.jl")
+    @testset "$(BACKEND) hybrid optimizers" include("./lbfgs.jl")
 end
diff --git a/test/utils.jl b/test/utils.jl
new file mode 100644
index 0000000..c841b9d
--- /dev/null
+++ b/test/utils.jl
@@ -0,0 +1,10 @@
+global backend = if GROUP == "CUDA"
+    using CUDA
+    CUDA.CUDABackend()
+elseif GROUP == "AMDGPU"
+    using AMDGPU
+    AMDGPU.ROCBackend()
+else
+    using KernelAbstractions
+    CPU()
+end