4 changes: 2 additions & 2 deletions docs/homepage/blog/ospp_report_210370190/index.md
@@ -491,11 +491,11 @@ create_critic(critic_dim) = Chain(
create_policy(player) = DDPGPolicy(
behavior_actor = NeuralNetworkApproximator(
model = create_actor(player),
optimizer = Flux.Optimise.Optimiser(ClipNorm(0.5), Adam(1e-2)),
optimizer = OptimiserChain(ClipNorm(0.5), Adam(1e-2)),
),
behavior_critic = NeuralNetworkApproximator(
model = create_critic(critic_dim),
optimizer = Flux.Optimise.Optimiser(ClipNorm(0.5), Adam(1e-2)),
optimizer = OptimiserChain(ClipNorm(0.5), Adam(1e-2)),
),
target_actor = NeuralNetworkApproximator(
model = create_actor(player),
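Note: `OptimiserChain` (from Optimisers.jl, available through Flux) replaces the old `Flux.Optimise.Optimiser` wrapper for composing gradient-transformation rules, applied left to right. A minimal sketch of how the replacement pairs with the explicit optimiser-state API; the `Dense(4 => 2)` model is only a stand-in:

using Flux

opt = OptimiserChain(ClipNorm(0.5), Adam(1e-2))  # clip the gradient norm, then take an Adam step
opt_state = Flux.setup(opt, Dense(4 => 2))       # explicit optimiser state, consumed later by Flux.update!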
4 changes: 4 additions & 0 deletions src/ReinforcementLearningCore/NEWS.md
@@ -1,5 +1,9 @@
# ReinforcementLearningCore.jl Release Notes

#### v0.15.4

- Update `Flux.jl` to `v0.16` and fix deprecation warnings and method errors

#### v0.15.3

- Make `FluxApproximator` work with `QBasedPolicy`
6 changes: 3 additions & 3 deletions src/ReinforcementLearningCore/Project.toml
@@ -1,6 +1,6 @@
name = "ReinforcementLearningCore"
uuid = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6"
version = "0.15.3"
version = "0.15.4"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@@ -31,8 +31,8 @@ CircularArrayBuffers = "0.1.12"
Crayons = "4"
Distributions = "0.25"
FillArrays = "0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 1"
Flux = "0.14"
GPUArrays = "8, 9, 10"
Flux = "0.14, 0.15, 0.16"
GPUArrays = "8, 9, 10, 11"
Metal = "1.0"
ProgressMeter = "1"
Reexport = "1"
@@ -43,5 +43,5 @@ Flux.@layer FluxApproximator trainable=(model,)
forward(A::FluxApproximator, args...; kwargs...) = A.model(args...; kwargs...)
forward(A::FluxApproximator, env::E, player::AbstractPlayer=current_player(env)) where {E <: AbstractEnv} = env |> (x -> state(x, player)) |> (x -> forward(A, x))

RLBase.optimise!(A::FluxApproximator, grad::NamedTuple) =
Flux.Optimise.update!(A.optimiser_state, A.model, grad.model)
RLBase.optimise!(A::FluxApproximator, grad::NamedTuple) = Flux.Optimise.update!(A.optimiser_state, A.model, grad.model)
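For context, differentiating a wrapper struct with Zygote's explicit mode yields a NamedTuple that mirrors the struct's fields, which is why the gradient reaches `optimise!` as `grad.model`. A minimal sketch, using a purely illustrative `Wrapper` type rather than the real `FluxApproximator` definition:

using Flux

struct Wrapper{M,S}
    model::M
    optimiser_state::S
end

m = Dense(3 => 1)
w = Wrapper(m, Flux.setup(Adam(1e-3), m))
x = rand(Float32, 3, 5)

grads = Flux.gradient(w) do a
    sum(a.model(x))
end
grad = grads[1]        # (model = (weight = ..., bias = ..., σ = nothing), optimiser_state = nothing)

Flux.update!(w.optimiser_state, w.model, grad.model)  # the same explicit update performed above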

@@ -74,9 +74,12 @@ function RLBase.optimise!(tn::TargetNetwork, grad::NamedTuple)
tn.n_optimise += 1

if tn.n_optimise % tn.sync_freq == 0
# polyak averaging
for (dest, src) in zip(Flux.params(target(tn)), Flux.params(tn.network))
dest .= tn.ρ .* dest .+ (1 - tn.ρ) .* src
# Polyak averaging
src_layers = RLCore.model(tn)
dest_layers = RLCore.target(tn)
for i in 1:length(src_layers)
dest_layers[i].weight .= tn.ρ .* dest_layers[i].weight .+ (1 - tn.ρ) .* src_layers[i].weight
dest_layers[i].bias .= tn.ρ .* dest_layers[i].bias .+ (1 - tn.ρ) .* src_layers[i].bias
end
tn.n_optimise = 0
end
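The loop above updates `weight` and `bias` fields directly, which matches the `Chain`-of-`Dense` models used in the tests. A standalone sketch of the same soft (Polyak) update under that assumption:

using Flux

function soft_update!(dest::Chain, src::Chain, ρ)
    # dest ← ρ .* dest .+ (1 - ρ) .* src, layer by layer (assumes Dense-like layers)
    for (d, s) in zip(dest.layers, src.layers)
        d.weight .= ρ .* d.weight .+ (1 - ρ) .* s.weight
        d.bias   .= ρ .* d.bias   .+ (1 - ρ) .* s.bias
    end
    return dest
end

online     = Chain(Dense(4 => 8, relu), Dense(8 => 2))
target_net = deepcopy(online)
soft_update!(target_net, online, 0.95f0)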
@@ -10,9 +10,9 @@ using ReinforcementLearningCore
@test_throws "AssertionError: `FluxApproximator` model is not on GPU." TargetNetwork(FluxApproximator(model, optimiser), use_gpu=true)
end
@test TargetNetwork(FluxApproximator(model=model, optimiser=optimiser, use_gpu=true), use_gpu=true) isa TargetNetwork
@test TargetNetwork(FluxApproximator(model, optimiser, use_gpu=true), use_gpu=true) isa TargetNetwork
@test TargetNetwork(FluxApproximator(model=model, optimiser=optimiser, use_gpu=true), use_gpu=true) isa TargetNetwork

approx = FluxApproximator(model, optimiser, use_gpu=false)
approx = FluxApproximator(model=model, optimiser=optimiser, use_gpu=false)
target_network = TargetNetwork(approx, use_gpu=false)


@@ -38,7 +38,7 @@ using ReinforcementLearningCore
@testset "Optimise" begin
optimiser = Adam()
model = Chain(Dense(10, 5, relu), Dense(5, 2))
approximator = FluxApproximator(model, optimiser)
approximator = FluxApproximator(model=model, optimiser=optimiser)
target_network = TargetNetwork(approximator)
input = rand(Float32, 10)
grad = Flux.Zygote.gradient(target_network) do model
@@ -54,7 +54,7 @@

@testset "Sync" begin
optimiser = Adam()
model = FluxApproximator(Chain(Dense(10, 5, relu), Dense(5, 2)), optimiser)
model = FluxApproximator(model=Chain(Dense(10, 5, relu), Dense(5, 2)), optimiser=optimiser)
target_network = TargetNetwork(model, sync_freq=2, ρ=0.5)

input = rand(Float32, 10)
@@ -75,9 +75,9 @@ end
m = Chain(Dense(4,1))
app = FluxApproximator(model = m, optimiser = Flux.Adam(), use_gpu=true)
tn = TargetNetwork(app, sync_freq = 3, use_gpu=true)
@test typeof(model(tn)) == typeof(target(tn))
p1 = Flux.destructure(model(tn))[1]
pt1 = Flux.destructure(target(tn))[1]
@test typeof(RLCore.model(tn)) == typeof(RLCore.target(tn))
p1 = Flux.destructure(RLCore.model(tn))[1]
pt1 = Flux.destructure(RLCore.target(tn))[1]
@test p1 == pt1
input = gpu(ones(Float32, 4))
grad = Flux.Zygote.gradient(tn) do model
Expand All @@ -87,16 +87,16 @@ end
grad_model = grad[1]

RLCore.optimise!(tn, grad_model)
@test p1 != Flux.destructure(model(tn))[1]
@test p1 == Flux.destructure(target(tn))[1]
@test p1 != Flux.destructure(RLCore.model(tn))[1]
@test p1 == Flux.destructure(RLCore.target(tn))[1]
RLCore.optimise!(tn, grad_model)
@test p1 != Flux.destructure(model(tn))[1]
@test p1 != Flux.destructure(RLCore.model(tn))[1]
@test p1 == Flux.destructure(target(tn))[1]
RLCore.optimise!(tn, grad_model)
@test Flux.destructure(target(tn))[1] == Flux.destructure(model(tn))[1]
@test Flux.destructure(RLCore.target(tn))[1] == Flux.destructure(RLCore.model(tn))[1]
@test p1 != Flux.destructure(target(tn))[1]
p2 = Flux.destructure(model(tn))[1]
p2 = Flux.destructure(RLCore.model(tn))[1]
RLCore.optimise!(tn, grad_model)
@test p2 != Flux.destructure(model(tn))[1]
@test p2 == Flux.destructure(target(tn))[1]
@test p2 != Flux.destructure(RLCore.model(tn))[1]
@test p2 == Flux.destructure(RLCore.target(tn))[1]
end
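These tests now construct approximators and target networks with keyword arguments only. A minimal usage sketch in the same spirit (layer sizes are arbitrary):

using Flux
using ReinforcementLearningCore

approx = FluxApproximator(model = Chain(Dense(10 => 5, relu), Dense(5 => 2)), optimiser = Adam())
tn     = TargetNetwork(approx, sync_freq = 2, ρ = 0.5)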
98 changes: 48 additions & 50 deletions src/ReinforcementLearningCore/test/utils/networks.jl
@@ -22,13 +22,13 @@ import ReinforcementLearningBase: RLBase
q_values = NN(rand(Float32, 2))
@test size(q_values) == (3,)

gs = gradient(params(NN)) do
gs = gradient(NN) do
sum(NN(rand(Float32, 2, 5)))
end

old_params = deepcopy(collect(params(NN).params))
old_params = deepcopy(collect(Flux.trainable(NN).params))
push!(NN, gs)
new_params = collect(params(NN).params)
new_params = collect(Flux.trainable(NN).params)

@test old_params != new_params
end
@@ -72,42 +72,40 @@
end
@testset "Correctness of gradients" begin
@testset "One action per state" begin
@test Flux.params(gn) == Flux.Params([gn.pre.weight, gn.pre.bias, gn.μ.weight, gn.μ.bias, gn.σ.weight, gn.σ.bias])
@test Flux.trainable(gn).pre == gn.pre
@test Flux.trainable(gn).μ == gn.μ
@test Flux.trainable(gn).σ == gn.σ
action_saver = Matrix[]
g = Flux.gradient(Flux.params(gn)) do
a, logp = gn(state, is_sampling = true, is_return_log_prob = true)
g = Flux.gradient(gn) do model
a, logp = model(state, is_sampling = true, is_return_log_prob = true)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
sum(logp)
end
g2 = Flux.gradient(Flux.params(gn)) do
logp = gn(state, only(action_saver))
g2 = Flux.gradient(gn) do model
logp = model(state, only(action_saver))
sum(logp)
end
#Check that gradients are identical
for (grad1, grad2) in zip(g,g2)
@test grad1 ≈ grad2
end
@test g == g2
end
@testset "Multiple actions per state" begin
#Same with multiple actions sampled
action_saver = []
state = unsqueeze(state, dims = 2)
g = Flux.gradient(Flux.params(gn)) do
a, logp = gn(state, 3)
g1 = Flux.gradient(gn) do model
a, logp = model(state, 3)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
sum(logp)
end
g2 = Flux.gradient(Flux.params(gn)) do
logp = gn(state, only(action_saver))
g2 = Flux.gradient(gn) do model
logp = model(state, only(action_saver))
sum(logp)
end
for (grad1, grad2) in zip(g,g2)
@test grad1 ≈ grad2
end
@test g1 == g2
end
end
end
@@ -117,7 +115,6 @@ import ReinforcementLearningBase: RLBase
gn = GaussianNetwork(Dense(20,15), Dense(15,10), Dense(15,10, softplus)) |> gpu
state = rand(Float32, 20,3) |> gpu #batch of 3 states
@testset "Forward pass compatibility" begin
@test Flux.params(gn) == Flux.Params([gn.pre.weight, gn.pre.bias, gn.μ.weight, gn.μ.bias, gn.σ.weight, gn.σ.bias])
m, L = gn(state)
@test size(m) == size(L) == (10,3)
a, logp = gn(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
@@ -134,15 +131,15 @@ import ReinforcementLearningBase: RLBase
@testset "Backward pass compatibility" begin
@testset "One action sampling" begin
action_saver = CuMatrix[]
g = Flux.gradient(Flux.params(gn)) do
a, logp = gn(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
g = Flux.gradient(gn) do model
a, logp = model(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
sum(logp)
end
g2 = Flux.gradient(Flux.params(gn)) do
logp = gn(state, only(action_saver))
g2 = Flux.gradient(gn) do model
logp = model(state, only(action_saver))
sum(logp)
end
#Check that gradients are identical
@@ -153,15 +150,15 @@ import ReinforcementLearningBase: RLBase
@testset "Multiple actions sampling" begin
action_saver = []
state = unsqueeze(state, dims = 2)
g = Flux.gradient(Flux.params(gn)) do
g = Flux.gradient(gn) do
a, logp = gn(CUDA.CURAND.RNG(), state, 3)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
sum(logp)
end
g2 = Flux.gradient(Flux.params(gn)) do
logp = gn(state, only(action_saver))
g2 = Flux.gradient(gn) do model
logp = model(state, only(action_saver))
sum(logp)
end
for (grad1, grad2) in zip(g,g2)
@@ -202,7 +199,10 @@ import ReinforcementLearningBase: RLBase
μ = Dense(15,10)
Σ = Dense(15,10*11÷2)
gn = CovGaussianNetwork(pre, μ, Σ)
@test Flux.params(gn) == Flux.Params([pre.weight, pre.bias, μ.weight, μ.bias, Σ.weight, Σ.bias])
@test Flux.trainable(gn).pre == pre
@test Flux.trainable(gn).μ == μ
@test Flux.trainable(gn).Σ == Σ

state = rand(Float32, 20,3) #batch of 3 states
#Check that it works in 2D
m, L = gn(state)
@@ -233,35 +233,34 @@ import ReinforcementLearningBase: RLBase
logp_truth = [logpdf(mvn, a) for (mvn, a) in zip(mvnormals, eachslice(as, dims = 3))]
@test stack(logp_truth; dims=2) ≈ dropdims(logps,dims = 1) #test against ground truth
action_saver = []
g = Flux.gradient(Flux.params(gn)) do
a, logp = gn(Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
g1 = Flux.gradient(gn) do model
a, logp = model(Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
mean(logp)
end
g2 = Flux.gradient(Flux.params(gn)) do
logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
g2 = Flux.gradient(gn) do model
logp = model(Flux.unsqueeze(state,dims = 2), only(action_saver))
mean(logp)
end
for (grad1, grad2) in zip(g,g2)
@test grad1 ≈ grad2
end
@test g1 == g2

empty!(action_saver)
g3 = Flux.gradient(Flux.params(gn)) do
a, logp = gn(Flux.unsqueeze(state,dims = 2), 3)

g3 = Flux.gradient(gn) do model
a, logp = model(Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
mean(logp)
end
g4 = Flux.gradient(Flux.params(gn)) do
logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
g4 = Flux.gradient(gn) do model
logp = model(Flux.unsqueeze(state, dims = 2), only(action_saver))
mean(logp)
end
for (grad1, grad2) in zip(g4,g3)
@test grad1 ≈ grad2
end

@test g4 == g3
end
@testset "CUDA" begin
if (@isdefined CUDA) && CUDA.functional()
@@ -271,7 +270,6 @@ import ReinforcementLearningBase: RLBase
μ = Dense(15,10) |> gpu
Σ = Dense(15,10*11÷2) |> gpu
gn = CovGaussianNetwork(pre, μ, Σ)
@test Flux.params(gn) == Flux.Params([pre.weight, pre.bias, μ.weight, μ.bias, Σ.weight, Σ.bias])
state = rand(Float32, 20,3)|> gpu #batch of 3 states
m, L = gn(Flux.unsqueeze(state,dims = 2))
@test size(m) == (10,1,3)
Expand All @@ -292,31 +290,31 @@ import ReinforcementLearningBase: RLBase
logp_truth = [logpdf(mvn, cpu(a)) for (mvn, a) in zip(mvnormals, eachslice(as, dims = 3))]
@test reduce(hcat, collect(logp_truth)) ≈ dropdims(cpu(logps); dims=1) #test against ground truth
action_saver = []
g = Flux.gradient(Flux.params(gn)) do
a, logp = gn(rng, Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
g = Flux.gradient(gn) do model
a, logp = model(rng, Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
mean(logp)
end

g2 = Flux.gradient(Flux.params(gn)) do
logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
g2 = Flux.gradient(gn) do model
logp = model(Flux.unsqueeze(state,dims = 2), only(action_saver))
mean(logp)
end
for (grad1, grad2) in zip(g,g2)
@test grad1 ≈ grad2
end
empty!(action_saver)
g3 = Flux.gradient(Flux.params(gn)) do
a, logp = gn(rng, Flux.unsqueeze(state,dims = 2), 3)
g3 = Flux.gradient(gn) do model
a, logp = model(rng, Flux.unsqueeze(state,dims = 2), 3)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
mean(logp)
end
g4 = Flux.gradient(Flux.params(gn)) do
logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
g4 = Flux.gradient(gn) do model
logp = model(Flux.unsqueeze(state,dims = 2), only(action_saver))
mean(logp)
end
for (grad1, grad2) in zip(g4,g3)
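The pattern running through these test changes is the move away from Zygote's implicit `Flux.params` API toward explicit differentiation of the model. A minimal sketch of the old and new styles with a throwaway model (the loss and sizes are illustrative):

using Flux

m = Chain(Dense(2 => 3, relu), Dense(3 => 1))
x = rand(Float32, 2, 5)

# Old (Flux <= 0.14): gs = gradient(() -> sum(m(x)), Flux.params(m))
# New: differentiate the model itself; the result mirrors the model's structure.
gs = Flux.gradient(model -> sum(model(x)), m)[1]

Flux.trainable(m[1])           # NamedTuple view of a layer's trainable fields (e.g. weight, bias)
θ, re = Flux.destructure(m)    # flat parameter vector, as used in the comparisons above

opt_state = Flux.setup(Adam(1e-2), m)
Flux.update!(opt_state, m, gs) # explicit update replaces Flux.Optimise.update!(opt, ps, gs)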
@@ -26,7 +26,7 @@ end
Dense(ns, 64, relu),
Dense(64, na, relu),
),
Flux.Optimise.Optimiser(ClipNorm(0.5), ADAM(1e-5)),
OptimiserChain(ClipNorm(0.5), Adam(1e-5)),
),
explorer = EpsilonGreedyExplorer(ϵ_stable=0.01),
),
9 changes: 7 additions & 2 deletions src/ReinforcementLearningEnvironments/test/runtests.jl
@@ -14,8 +14,13 @@ using TimerOutputs
using Conda
using JLD2

Conda.add("gym")
Conda.add("numpy")
ENV["CONDA_JL_USE_MINIFORGE"] = "1"

Conda.add("python", Conda.ROOTENV)
Conda.add("numpy", Conda.ROOTENV)
Conda.pip_interop(true, Conda.ROOTENV)
Conda.pip("install", "gym", Conda.ROOTENV)


@testset "ReinforcementLearningEnvironments" begin
include("environments/environments.jl")
2 changes: 1 addition & 1 deletion src/ReinforcementLearningFarm/Project.toml
@@ -13,7 +13,7 @@ ReinforcementLearning = "158674fc-8238-5cab-b5ba-03dfc80d1318"

[compat]
FillArrays = "1"
Flux = "0.14"
Flux = "0.14, 0.15, 0.16"
CircularArrayBuffers = "0.1.12"
Distributions = "0.25"
ReinforcementLearning = "0.11"