@@ -31,12 +31,12 @@ $TYPEDSIGNATURES
Run the policy on the environment and return the total reward and a dataset of observations.
By default, the environment is reset before running the policy.
"""
34- function run_policy! (policy, env:: AbstractEnvironment )
34+ function run_policy! (policy, env:: AbstractEnvironment ; kwargs ... )
3535 total_reward = 0.0
3636 reset! (env; reset_seed= false )
3737 local labeled_dataset
3838 while ! is_terminated (env)
39- y = policy (env)
39+ y = policy (env; kwargs ... )
4040 features, state = observe (env)
4141 if @isdefined labeled_dataset
4242 push! (labeled_dataset, DataSample (; x= features, y_true= y, instance= state))
@@ -49,33 +49,35 @@ function run_policy!(policy, env::AbstractEnvironment)
4949 return total_reward, labeled_dataset
5050end
5151
"""
$TYPEDSIGNATURES

Run the policy on each environment in `envs`, forwarding `kwargs` to the policy call.

Return a `Vector{Float64}` of per-environment total rewards and the concatenation
of all per-environment labeled datasets, in environment order.
"""
function run_policy!(policy, envs::Vector{<:AbstractEnvironment}; kwargs...)
    E = length(envs)
    rewards = zeros(Float64, E)
    datasets = map(1:E) do e
        # Delegate to the single-environment method; record the reward as a
        # side effect so `map` can return just the dataset.
        reward, dataset = run_policy!(policy, envs[e]; kwargs...)
        rewards[e] = reward
        return dataset
    end
    # `reduce(vcat, ...)` avoids splatting a runtime-length vector into varargs;
    # keep the empty-vector fast path so `envs == []` does not throw.
    return rewards, isempty(datasets) ? datasets : reduce(vcat, datasets)
end
6262
"""
$TYPEDSIGNATURES

Run the policy on `env` for `episodes` independent episodes, re-seeding the
environment with `seed` once before the first episode (subsequent episodes are
reset without re-seeding by the single-episode method). Additional `kwargs`
are forwarded to the policy call.

Return the average total reward per episode and the concatenation of all
per-episode labeled datasets.

# Throws
- `ArgumentError` if `episodes` is not positive (the original silently returned
  `NaN` from a division by zero).
"""
function run_policy!(
    policy, env::AbstractEnvironment, episodes::Int; seed=get_seed(env), kwargs...
)
    episodes > 0 || throw(ArgumentError("episodes must be positive, got $episodes"))
    # Re-seed once so the whole sequence of episodes is reproducible.
    reset!(env; reset_seed=true, seed)
    # Collect (reward, dataset) pairs rather than accumulating into a variable
    # reassigned inside the closure, which would be boxed (type-unstable capture).
    results = map(1:episodes) do _
        return run_policy!(policy, env; kwargs...)
    end
    mean_reward = sum(first, results) / episodes
    # `reduce(vcat, ...)` avoids splatting a runtime-length vector into varargs;
    # `episodes > 0` guarantees `results` is non-empty.
    return mean_reward, reduce(vcat, map(last, results))
end
7375
74- function run_policy! (policy, envs:: Vector{<:AbstractEnvironment} , episodes:: Int )
76+ function run_policy! (policy, envs:: Vector{<:AbstractEnvironment} , episodes:: Int ; kwargs ... )
7577 E = length (envs)
7678 rewards = zeros (Float64, E)
7779 datasets = map (1 : E) do e
78- reward, dataset = run_policy! (policy, envs[e], episodes)
80+ reward, dataset = run_policy! (policy, envs[e], episodes; kwargs ... )
7981 rewards[e] = reward
8082 return dataset
8183 end
0 commit comments