diff --git a/docs/homepage/blog/ospp_report_210370190/index.md b/docs/homepage/blog/ospp_report_210370190/index.md
index 688e9428a..3a7fac3e3 100644
--- a/docs/homepage/blog/ospp_report_210370190/index.md
+++ b/docs/homepage/blog/ospp_report_210370190/index.md
@@ -491,11 +491,11 @@ create_critic(critic_dim) = Chain(
 create_policy(player) = DDPGPolicy(
     behavior_actor = NeuralNetworkApproximator(
         model = create_actor(player),
-        optimizer = Flux.Optimise.Optimiser(ClipNorm(0.5), Adam(1e-2)),
+        optimizer = OptimiserChain(ClipNorm(0.5), Adam(1e-2)),
     ),
     behavior_critic = NeuralNetworkApproximator(
         model = create_critic(critic_dim),
-        optimizer = Flux.Optimise.Optimiser(ClipNorm(0.5), Adam(1e-2)),
+        optimizer = OptimiserChain(ClipNorm(0.5), Adam(1e-2)),
     ),
     target_actor = NeuralNetworkApproximator(
         model = create_actor(player),
diff --git a/src/ReinforcementLearningCore/NEWS.md b/src/ReinforcementLearningCore/NEWS.md
index 472e6e2d9..a44325896 100644
--- a/src/ReinforcementLearningCore/NEWS.md
+++ b/src/ReinforcementLearningCore/NEWS.md
@@ -1,5 +1,9 @@
 # ReinforcementLearningCore.jl Release Notes

+#### v0.15.4
+
+- Update `Flux.jl` to `v0.16` and fix deprecation warnings and method errors
+
 #### v0.15.3

 - Make `FluxApproximator` work with `QBasedPolicy`
diff --git a/src/ReinforcementLearningCore/Project.toml b/src/ReinforcementLearningCore/Project.toml
index 05d099c86..70fc380da 100644
--- a/src/ReinforcementLearningCore/Project.toml
+++ b/src/ReinforcementLearningCore/Project.toml
@@ -1,6 +1,6 @@
 name = "ReinforcementLearningCore"
 uuid = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6"
-version = "0.15.3"
+version = "0.15.4"

 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@@ -31,8 +31,8 @@ CircularArrayBuffers = "0.1.12"
 Crayons = "4"
 Distributions = "0.25"
 FillArrays = "0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 1"
-Flux = "0.14"
-GPUArrays = "8, 9, 10"
+Flux = "0.14, 0.15, 0.16"
+GPUArrays = "8, 9, 10, 11"
 Metal = "1.0"
 ProgressMeter = "1"
 Reexport = "1"
diff --git a/src/ReinforcementLearningCore/src/policies/learners/flux_approximator.jl b/src/ReinforcementLearningCore/src/policies/learners/flux_approximator.jl
index 2227e201f..a6f0cb5b9 100644
--- a/src/ReinforcementLearningCore/src/policies/learners/flux_approximator.jl
+++ b/src/ReinforcementLearningCore/src/policies/learners/flux_approximator.jl
@@ -43,5 +43,5 @@ Flux.@layer FluxApproximator trainable=(model,)
 forward(A::FluxApproximator, args...; kwargs...) = A.model(args...; kwargs...)
 forward(A::FluxApproximator, env::E, player::AbstractPlayer=current_player(env)) where {E <: AbstractEnv} = env |> (x -> state(x, player)) |> (x -> forward(A, x))

-RLBase.optimise!(A::FluxApproximator, grad::NamedTuple) =
-    Flux.Optimise.update!(A.optimiser_state, A.model, grad.model)
+RLBase.optimise!(A::FluxApproximator, grad::NamedTuple) = Flux.Optimise.update!(A.optimiser_state, A.model, grad.model)
+
diff --git a/src/ReinforcementLearningCore/src/policies/learners/target_network.jl b/src/ReinforcementLearningCore/src/policies/learners/target_network.jl
index 7a3b8490a..1178ce527 100644
--- a/src/ReinforcementLearningCore/src/policies/learners/target_network.jl
+++ b/src/ReinforcementLearningCore/src/policies/learners/target_network.jl
@@ -74,9 +74,12 @@ function RLBase.optimise!(tn::TargetNetwork, grad::NamedTuple)
     tn.n_optimise += 1

     if tn.n_optimise % tn.sync_freq == 0
-        # polyak averaging
-        for (dest, src) in zip(Flux.params(target(tn)), Flux.params(tn.network))
-            dest .= tn.ρ .* dest .+ (1 - tn.ρ) .* src
+        # Polyak averaging
+        src_layers = RLCore.model(tn)
+        dest_layers = RLCore.target(tn)
+        for i in 1:length(src_layers)
+            dest_layers[i].weight .= tn.ρ .* dest_layers[i].weight .+ (1 - tn.ρ) .* src_layers[i].weight
+            dest_layers[i].bias .= tn.ρ .* dest_layers[i].bias .+ (1 - tn.ρ) .* src_layers[i].bias
         end
         tn.n_optimise = 0
     end
diff --git a/src/ReinforcementLearningCore/test/policies/learners/target_network.jl b/src/ReinforcementLearningCore/test/policies/learners/target_network.jl
index e9182ddaa..e4f05396b 100644
--- a/src/ReinforcementLearningCore/test/policies/learners/target_network.jl
+++ b/src/ReinforcementLearningCore/test/policies/learners/target_network.jl
@@ -10,9 +10,9 @@ using ReinforcementLearningCore
             @test_throws "AssertionError: `FluxApproximator` model is not on GPU." TargetNetwork(
                 FluxApproximator(model, optimiser), use_gpu=true)
         end
         @test TargetNetwork(FluxApproximator(model=model, optimiser=optimiser, use_gpu=true), use_gpu=true) isa TargetNetwork
-        @test TargetNetwork(FluxApproximator(model, optimiser, use_gpu=true), use_gpu=true) isa TargetNetwork
+        @test TargetNetwork(FluxApproximator(model=model, optimiser=optimiser, use_gpu=true), use_gpu=true) isa TargetNetwork

-        approx = FluxApproximator(model, optimiser, use_gpu=false)
+        approx = FluxApproximator(model=model, optimiser=optimiser, use_gpu=false)
         target_network = TargetNetwork(approx, use_gpu=false)

@@ -38,7 +38,7 @@ using ReinforcementLearningCore
     @testset "Optimise" begin
         optimiser = Adam()
         model = Chain(Dense(10, 5, relu), Dense(5, 2))
-        approximator = FluxApproximator(model, optimiser)
+        approximator = FluxApproximator(model=model, optimiser=optimiser)
         target_network = TargetNetwork(approximator)
         input = rand(Float32, 10)
         grad = Flux.Zygote.gradient(target_network) do model
@@ -54,7 +54,7 @@ using ReinforcementLearningCore
     @testset "Sync" begin
         optimiser = Adam()
-        model = FluxApproximator(Chain(Dense(10, 5, relu), Dense(5, 2)), optimiser)
+        model = FluxApproximator(model=Chain(Dense(10, 5, relu), Dense(5, 2)), optimiser=optimiser)
         target_network = TargetNetwork(model, sync_freq=2, ρ=0.5)

         input = rand(Float32, 10)
@@ -75,9 +75,9 @@ end
     m = Chain(Dense(4,1))
     app = FluxApproximator(model = m, optimiser = Flux.Adam(), use_gpu=true)
     tn = TargetNetwork(app, sync_freq = 3, use_gpu=true)
-    @test typeof(model(tn)) == typeof(target(tn))
-    p1 = Flux.destructure(model(tn))[1]
-    pt1 = Flux.destructure(target(tn))[1]
+    @test typeof(RLCore.model(tn)) == typeof(RLCore.target(tn))
+    p1 = Flux.destructure(RLCore.model(tn))[1]
+    pt1 = Flux.destructure(RLCore.target(tn))[1]
     @test p1 == pt1
     input = gpu(ones(Float32, 4))
     grad = Flux.Zygote.gradient(tn) do model
@@ -87,16 +87,16 @@ end
     grad_model = grad[1]

     RLCore.optimise!(tn, grad_model)
-    @test p1 != Flux.destructure(model(tn))[1]
-    @test p1 == Flux.destructure(target(tn))[1]
+    @test p1 != Flux.destructure(RLCore.model(tn))[1]
+    @test p1 == Flux.destructure(RLCore.target(tn))[1]
     RLCore.optimise!(tn, grad_model)
-    @test p1 != Flux.destructure(model(tn))[1]
+    @test p1 != Flux.destructure(RLCore.model(tn))[1]
     @test p1 == Flux.destructure(target(tn))[1]
     RLCore.optimise!(tn, grad_model)
-    @test Flux.destructure(target(tn))[1] == Flux.destructure(model(tn))[1]
+    @test Flux.destructure(RLCore.target(tn))[1] == Flux.destructure(RLCore.model(tn))[1]
     @test p1 != Flux.destructure(target(tn))[1]
-    p2 = Flux.destructure(model(tn))[1]
+    p2 = Flux.destructure(RLCore.model(tn))[1]
     RLCore.optimise!(tn, grad_model)
-    @test p2 != Flux.destructure(model(tn))[1]
-    @test p2 == Flux.destructure(target(tn))[1]
+    @test p2 != Flux.destructure(RLCore.model(tn))[1]
+    @test p2 == Flux.destructure(RLCore.target(tn))[1]
 end
diff --git a/src/ReinforcementLearningCore/test/utils/networks.jl b/src/ReinforcementLearningCore/test/utils/networks.jl
index f070dc75c..d078928b7 100644
--- a/src/ReinforcementLearningCore/test/utils/networks.jl
+++ b/src/ReinforcementLearningCore/test/utils/networks.jl
@@ -22,13 +22,13 @@ import ReinforcementLearningBase: RLBase
             q_values = NN(rand(Float32, 2))
             @test size(q_values) == (3,)

-            gs = gradient(params(NN)) do
+            gs = gradient(NN) do
                 sum(NN(rand(Float32, 2, 5)))
             end

-            old_params = deepcopy(collect(params(NN).params))
+            old_params = deepcopy(collect(Flux.trainable(NN).params))
             push!(NN, gs)
-            new_params = collect(params(NN).params)
+            new_params = collect(Flux.trainable(NN).params)
             @test old_params != new_params
         end
@@ -72,42 +72,40 @@ import ReinforcementLearningBase: RLBase
         end
         @testset "Correctness of gradients" begin
             @testset "One action per state" begin
-                @test Flux.params(gn) == Flux.Params([gn.pre.weight, gn.pre.bias, gn.μ.weight, gn.μ.bias, gn.σ.weight, gn.σ.bias])
+                @test Flux.trainable(gn).pre == gn.pre
+                @test Flux.trainable(gn).μ == gn.μ
+                @test Flux.trainable(gn).σ == gn.σ
                 action_saver = Matrix[]
-                g = Flux.gradient(Flux.params(gn)) do
-                    a, logp = gn(state, is_sampling = true, is_return_log_prob = true)
+                g = Flux.gradient(gn) do model
+                    a, logp = model(state, is_sampling = true, is_return_log_prob = true)
                     ChainRulesCore.ignore_derivatives() do
                         push!(action_saver, a)
                     end
                     sum(logp)
                 end
-                g2 = Flux.gradient(Flux.params(gn)) do
-                    logp = gn(state, only(action_saver))
+                g2 = Flux.gradient(gn) do model
+                    logp = model(state, only(action_saver))
                     sum(logp)
                 end
                 #Check that gradients are identical
-                for (grad1, grad2) in zip(g,g2)
-                    @test grad1 ≈ grad2
-                end
+                @test g == g2
             end
             @testset "Multiple actions per state" begin
                 #Same with multiple actions sampled
                 action_saver = []
                 state = unsqueeze(state, dims = 2)
-                g = Flux.gradient(Flux.params(gn)) do
-                    a, logp = gn(state, 3)
+                g1 = Flux.gradient(gn) do model
+                    a, logp = model(state, 3)
                     ChainRulesCore.ignore_derivatives() do
                         push!(action_saver, a)
                     end
                     sum(logp)
                 end
-                g2 = Flux.gradient(Flux.params(gn)) do
-                    logp = gn(state, only(action_saver))
+                g2 = Flux.gradient(gn) do model
+                    logp = model(state, only(action_saver))
                     sum(logp)
                 end
-                for (grad1, grad2) in zip(g,g2)
-                    @test grad1 ≈ grad2
-                end
+                @test g1 == g2
             end
         end
     end
@@ -117,7 +115,6 @@ import ReinforcementLearningBase: RLBase
         gn = GaussianNetwork(Dense(20,15), Dense(15,10), Dense(15,10, softplus)) |> gpu
         state = rand(Float32, 20,3) |> gpu #batch of 3 states
         @testset "Forward pass compatibility" begin
-            @test Flux.params(gn) == Flux.Params([gn.pre.weight, gn.pre.bias, gn.μ.weight, gn.μ.bias, gn.σ.weight, gn.σ.bias])
             m, L = gn(state)
             @test size(m) == size(L) == (10,3)
             a, logp = gn(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
@@ -134,15 +131,15 @@ import ReinforcementLearningBase: RLBase
         @testset "Backward pass compatibility" begin
             @testset "One action sampling" begin
                 action_saver = CuMatrix[]
-                g = Flux.gradient(Flux.params(gn)) do
-                    a, logp = gn(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
+                g = Flux.gradient(gn) do model
+                    a, logp = model(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
                     ChainRulesCore.ignore_derivatives() do
                         push!(action_saver, a)
                     end
                     sum(logp)
                 end
-                g2 = Flux.gradient(Flux.params(gn)) do
-                    logp = gn(state, only(action_saver))
+                g2 = Flux.gradient(gn) do model
+                    logp = model(state, only(action_saver))
                     sum(logp)
                 end
                 #Check that gradients are identical
@@ -153,15 +150,15 @@ import ReinforcementLearningBase: RLBase
             @testset "Multiple actions sampling" begin
                 action_saver = []
                 state = unsqueeze(state, dims = 2)
-                g = Flux.gradient(Flux.params(gn)) do
+                g = Flux.gradient(gn) do
                     a, logp = gn(CUDA.CURAND.RNG(), state, 3)
                     ChainRulesCore.ignore_derivatives() do
                         push!(action_saver, a)
                     end
                     sum(logp)
                 end
-                g2 = Flux.gradient(Flux.params(gn)) do
-                    logp = gn(state, only(action_saver))
+                g2 = Flux.gradient(gn) do model
+                    logp = model(state, only(action_saver))
                     sum(logp)
                 end
                 for (grad1, grad2) in zip(g,g2)
@@ -202,7 +199,10 @@ import ReinforcementLearningBase: RLBase
         μ = Dense(15,10)
         Σ = Dense(15,10*11÷2)
         gn = CovGaussianNetwork(pre, μ, Σ)
-        @test Flux.params(gn) == Flux.Params([pre.weight, pre.bias, μ.weight, μ.bias, Σ.weight, Σ.bias])
+        @test Flux.trainable(gn).pre == pre
+        @test Flux.trainable(gn).μ == μ
+        @test Flux.trainable(gn).Σ == Σ
+
         state = rand(Float32, 20,3) #batch of 3 states
         #Check that it works in 2D
         m, L = gn(state)
@@ -233,35 +233,34 @@ import ReinforcementLearningBase: RLBase
             logp_truth = [logpdf(mvn, a) for (mvn, a) in zip(mvnormals, eachslice(as, dims = 3))]
             @test stack(logp_truth; dims=2) ≈ dropdims(logps,dims = 1) #test against ground truth
             action_saver = []
-            g = Flux.gradient(Flux.params(gn)) do
-                a, logp = gn(Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
+            g1 = Flux.gradient(gn) do model
+                a, logp = model(Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
                 ChainRulesCore.ignore_derivatives() do
                     push!(action_saver, a)
                 end
                 mean(logp)
             end
-            g2 = Flux.gradient(Flux.params(gn)) do
-                logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
+            g2 = Flux.gradient(gn) do model
+                logp = model(Flux.unsqueeze(state,dims = 2), only(action_saver))
                 mean(logp)
             end
-            for (grad1, grad2) in zip(g,g2)
-                @test grad1 ≈ grad2
-            end
+            @test g1 == g2
+
             empty!(action_saver)
-            g3 = Flux.gradient(Flux.params(gn)) do
-                a, logp = gn(Flux.unsqueeze(state,dims = 2), 3)
+
+            g3 = Flux.gradient(gn) do model
+                a, logp = model(Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
                 ChainRulesCore.ignore_derivatives() do
                     push!(action_saver, a)
                 end
                 mean(logp)
             end
-            g4 = Flux.gradient(Flux.params(gn)) do
-                logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
+            g4 = Flux.gradient(gn) do model
+                logp = model(Flux.unsqueeze(state, dims = 2), only(action_saver))
                 mean(logp)
             end
-            for (grad1, grad2) in zip(g4,g3)
-                @test grad1 ≈ grad2
-            end
+
+            @test g4 == g3
         end
         @testset "CUDA" begin
             if (@isdefined CUDA) && CUDA.functional()
@@ -271,7 +270,6 @@ import ReinforcementLearningBase: RLBase
                 μ = Dense(15,10) |> gpu
                 Σ = Dense(15,10*11÷2) |> gpu
                 gn = CovGaussianNetwork(pre, μ, Σ)
-                @test Flux.params(gn) == Flux.Params([pre.weight, pre.bias, μ.weight, μ.bias, Σ.weight, Σ.bias])
                 state = rand(Float32, 20,3)|> gpu #batch of 3 states
                 m, L = gn(Flux.unsqueeze(state,dims = 2))
                 @test size(m) == (10,1,3)
@@ -292,31 +290,31 @@ import ReinforcementLearningBase: RLBase
                 logp_truth = [logpdf(mvn, cpu(a)) for (mvn, a) in zip(mvnormals, eachslice(as, dims = 3))]
                 @test reduce(hcat, collect(logp_truth)) ≈ dropdims(cpu(logps); dims=1) #test against ground truth
                 action_saver = []
-                g = Flux.gradient(Flux.params(gn)) do
-                    a, logp = gn(rng, Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
+                g = Flux.gradient(gn) do model
+                    a, logp = model(rng, Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
                     ChainRulesCore.ignore_derivatives() do
                         push!(action_saver, a)
                     end
                     mean(logp)
                 end
-                g2 = Flux.gradient(Flux.params(gn)) do
-                    logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
+                g2 = Flux.gradient(gn) do model
+                    logp = model(Flux.unsqueeze(state,dims = 2), only(action_saver))
                     mean(logp)
                 end
                 for (grad1, grad2) in zip(g,g2)
                     @test grad1 ≈ grad2
                 end
                 empty!(action_saver)
-                g3 = Flux.gradient(Flux.params(gn)) do
-                    a, logp = gn(rng, Flux.unsqueeze(state,dims = 2), 3)
+                g3 = Flux.gradient(gn) do model
+                    a, logp = model(rng, Flux.unsqueeze(state,dims = 2), 3)
                     ChainRulesCore.ignore_derivatives() do
                         push!(action_saver, a)
                     end
                     mean(logp)
                 end
-                g4 = Flux.gradient(Flux.params(gn)) do
-                    logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
+                g4 = Flux.gradient(gn) do model
+                    logp = model(Flux.unsqueeze(state,dims = 2), only(action_saver))
                     mean(logp)
                 end
                 for (grad1, grad2) in zip(g4,g3)
diff --git a/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl b/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl
index 9b0866b69..2c99a2f74 100644
--- a/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl
+++ b/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl
@@ -26,7 +26,7 @@ end
                     Dense(ns, 64, relu),
                     Dense(64, na, relu),
                 ),
-                Flux.Optimise.Optimiser(ClipNorm(0.5), ADAM(1e-5)),
+                OptimiserChain(ClipNorm(0.5), Adam(1e-5)),
             ),
             explorer = EpsilonGreedyExplorer(ϵ_stable=0.01),
         ),
diff --git a/src/ReinforcementLearningEnvironments/test/runtests.jl b/src/ReinforcementLearningEnvironments/test/runtests.jl
index 80bb8fe8a..6c1d1b945 100644
--- a/src/ReinforcementLearningEnvironments/test/runtests.jl
+++ b/src/ReinforcementLearningEnvironments/test/runtests.jl
@@ -14,8 +14,13 @@ using TimerOutputs
 using Conda
 using JLD2

-Conda.add("gym")
-Conda.add("numpy")
+ENV["CONDA_JL_USE_MINIFORGE"] = "1"
+
+Conda.add("python", Conda.ROOTENV)
+Conda.add("numpy", Conda.ROOTENV)
+Conda.pip_interop(true, Conda.ROOTENV)
+Conda.pip("install", "gym", Conda.ROOTENV)
+
 @testset "ReinforcementLearningEnvironments" begin
     include("environments/environments.jl")
diff --git a/src/ReinforcementLearningFarm/Project.toml b/src/ReinforcementLearningFarm/Project.toml
index fd2c22bce..50297f670 100644
--- a/src/ReinforcementLearningFarm/Project.toml
+++ b/src/ReinforcementLearningFarm/Project.toml
@@ -13,7 +13,7 @@ ReinforcementLearning = "158674fc-8238-5cab-b5ba-03dfc80d1318"
 [compat]
 FillArrays = "1"
-Flux = "0.14"
+Flux = "0.14, 0.15, 0.16"
 CircularArrayBuffers = "0.1.12"
 Distributions = "0.25"
 ReinforcementLearning = "0.11"