
Commit 926291a

changed again how the policy evaluation works + cleanup
1 parent 2d80305 commit 926291a

File tree

- README.md
- docs/src/benchmark_interfaces.md
- src/DynamicVehicleScheduling/anticipative_solver.jl
- src/DynamicVehicleScheduling/plot.jl
- src/Utils/policy.jl
- whale_shark_128786.mp4

6 files changed: +28 −54 lines changed
README.md

Lines changed: 4 additions & 0 deletions
@@ -6,6 +6,10 @@
 [![Coverage](https://codecov.io/gh/JuliaDecisionFocusedLearning/DecisionFocusedLearningBenchmarks.jl/branch/main/graph/badge.svg)](https://app.codecov.io/gh/JuliaDecisionFocusedLearning/DecisionFocusedLearningBenchmarks.jl)
 [![Code Style: Blue](https://img.shields.io/badge/code%20style-blue-4495d1.svg)](https://github.com/JuliaDiff/BlueStyle)
 
+!!! warning
+    This package is currently under active development. The API may change in future releases.
+    Please refer to the [documentation](https://JuliaDecisionFocusedLearning.github.io/DecisionFocusedLearningBenchmarks.jl/stable/) for the latest updates.
+
 ## What is Decision-Focused Learning?
 
 Decision-focused learning (DFL) is a paradigm that integrates machine learning prediction with combinatorial optimization to make better decisions under uncertainty. Unlike traditional "predict-then-optimize" approaches that optimize prediction accuracy independently of downstream decision quality, DFL directly optimizes end-to-end decision performance.
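
For intuition, here is a minimal sketch of the contrast described above. It is illustrative only, not this package's API; the optimizer and cost below are toy placeholders:

```julia
# Illustrative only: toy one-dimensional example, not this package's API.
# `optimizer` picks the decision minimizing predicted cost over two options.
optimizer(θ̂) = θ̂ > 0 ? :option_a : :option_b
decision_cost(ŷ, θ_true) = ŷ == :option_a ? θ_true : -θ_true

# Predict-then-optimize: fit the prediction to the true parameter alone.
prediction_loss(model, x, θ_true) = abs2(model(x) - θ_true)

# Decision-focused: score the decision induced by the prediction
# against the true parameter, targeting decision quality directly.
function decision_loss(model, x, θ_true)
    θ̂ = model(x)                     # predicted problem parameter
    ŷ = optimizer(θ̂)                 # decision made from the prediction
    return decision_cost(ŷ, θ_true)  # cost of that decision under the truth
end
```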

docs/src/benchmark_interfaces.md

Lines changed: 2 additions & 2 deletions
@@ -26,13 +26,13 @@ The package defines a hierarchy of three abstract types:
 
 ```
 AbstractBenchmark
-── AbstractStochasticBenchmark{exogenous}
+├── AbstractStochasticBenchmark{exogenous}
 └── AbstractDynamicBenchmark{exogenous}
 ```
 
 - **`AbstractBenchmark`**: static, single-stage optimization problems
 - **`AbstractStochasticBenchmark{exogenous}`**: stochastic, single-stage optimization problems
-**`AbstractDynamicBenchmark{exogenous}`**: multi-stage sequential decision problems
+- **`AbstractDynamicBenchmark{exogenous}`**: multi-stage sequential decision-making problems
 
 The `{exogenous}` type parameter indicates whether the uncertainty distribution comes from external sources (`true`) or is influenced by decisions (`false`), which affects the available methods.

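A minimal sketch of how a user-defined benchmark might plug into this hierarchy, assuming the abstract types are exported by the package as documented; `MyRoutingBenchmark` and its field are made up:

```julia
using DecisionFocusedLearningBenchmarks

# Hypothetical benchmark. `true` marks exogenous uncertainty: the noise
# distribution is external and not influenced by the decisions taken.
struct MyRoutingBenchmark <: AbstractStochasticBenchmark{true}
    n_customers::Int
end
```
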
src/DynamicVehicleScheduling/anticipative_solver.jl

Lines changed: 3 additions & 1 deletion
@@ -215,13 +215,15 @@ function anticipative_solver(
         current_epoch=epoch,
     )
 
+    reward = -cost(state, decode_bitmatrix_to_routes(y_true))
+
     x = if two_dimensional_features
         compute_2D_features(state, env.instance)
     else
         compute_features(state, env.instance)
    end
 
-    return DataSample(; instance=state, y_true, x)
+    return DataSample(; instance=(; state, reward), y_true, x)
 end
 
 return obj, dataset
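
With this change, each sample's `instance` field carries a `(; state, reward)` named tuple instead of a bare state. A sketch of how downstream code might unpack it, assuming `dataset` is the dataset returned above:

```julia
# Sketch: unpack the (; state, reward) named tuple now stored in the
# `instance` field of each sample produced by the anticipative solver.
sample = first(dataset)
state = sample.instance.state    # DVSP state snapshot for this epoch
reward = sample.instance.reward  # negative cost of the y_true routes
```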

src/DynamicVehicleScheduling/plot.jl

Lines changed: 1 addition & 31 deletions
@@ -155,7 +155,7 @@ function plot_routes(
     state::DVSPState,
     routes::Vector{Vector{Int}};
     route_color=nothing,
-    route_linewidth=2, # Increased from 2 to 3
+    route_linewidth=2,
     route_alpha=0.8,
     kwargs...,
 )
@@ -191,36 +191,6 @@ function plot_routes(
     return fig
 end
 
-# """
-# $TYPEDSIGNATURES
-
-# Plot a given DVSPState with routes overlaid. This version accepts routes as a single
-# vector where routes are separated by depot visits (index 1).
-# """
-# function plot_routes(state::DVSPState, routes::Vector{Int}; kwargs...)
-#     # Convert single route vector to vector of route vectors
-#     route_vectors = Vector{Int}[]
-#     current_route = Int[]
-
-#     for location in routes
-#         if location == 1 # Depot visit indicates end of route
-#             if !isempty(current_route)
-#                 push!(route_vectors, copy(current_route))
-#                 empty!(current_route)
-#             end
-#         else
-#             push!(current_route, location)
-#         end
-#     end
-
-#     # Add the last route if it doesn't end with depot
-#     if !isempty(current_route)
-#         push!(route_vectors, current_route)
-#     end
-
-#     return plot_routes(state, route_vectors; kwargs...)
-# end
-
 """
 $TYPEDSIGNATURES

src/Utils/policy.jl

Lines changed: 18 additions & 20 deletions
@@ -31,37 +31,31 @@ $TYPEDSIGNATURES
 Run the policy on the environment and return the total reward and a dataset of observations.
 By default, the environment is reset before running the policy.
 """
-function evaluate_policy!(policy, env::AbstractEnvironment; kwargs...)
+function evaluate_policy!(
+    policy, env::AbstractEnvironment; reset_env=true, seed=get_seed(env), kwargs...
+)
+    if reset_env
+        reset!(env; reset_rng=true, seed=seed)
+    end
     total_reward = 0.0
     local labeled_dataset
     while !is_terminated(env)
         y = policy(env; kwargs...)
         features, state = observe(env)
+        reward = step!(env, y)
+        sample = DataSample(;
+            x=features, y_true=y, instance=(; state=deepcopy(state), reward)
+        )
         if @isdefined labeled_dataset
-            push!(
-                labeled_dataset,
-                DataSample(; x=features, y_true=y, instance=deepcopy(state)),
-            )
+            push!(labeled_dataset, sample)
         else
-            labeled_dataset = [DataSample(; x=features, y_true=y, instance=deepcopy(state))]
+            labeled_dataset = [sample]
         end
-        reward = step!(env, y)
         total_reward += reward
     end
     return total_reward, labeled_dataset
 end
 
-# function evaluate_policy!(policy, envs::Vector{<:AbstractEnvironment}; kwargs...)
-#     E = length(envs)
-#     rewards = zeros(Float64, E)
-#     datasets = map(1:E) do e
-#         reward, dataset = evaluate_policy!(policy, envs[e]; kwargs...)
-#         rewards[e] = reward
-#         return dataset
-#     end
-#     return rewards, vcat(datasets...)
-# end
-
 """
 $TYPEDSIGNATURES
 
@@ -73,8 +67,12 @@ function evaluate_policy!(
 )
     total_reward = 0.0
     datasets = map(1:episodes) do _i
-        reset!(env; reset_rng=(_i == 1))
-        reward, dataset = evaluate_policy!(policy, env; kwargs...)
+        if _i == 1
+            reset!(env; reset_rng=true, seed=seed)
+        else
+            reset!(env; reset_rng=false)
+        end
+        reward, dataset = evaluate_policy!(policy, env; reset_env=false, kwargs...)
         total_reward += reward
         return dataset
    end
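
A hedged usage sketch of the reworked entry point; `my_policy` and `env` are placeholders rather than names from the package, while `reset_env`, `seed`, and `get_seed` follow the diff above:

```julia
# Sketch only: `my_policy` and `env` are placeholders.
# Reproducible single run: reset once with the environment's own seed.
total_reward, dataset = evaluate_policy!(
    my_policy, env; reset_env=true, seed=get_seed(env)
)

# Each observation now bundles the state snapshot with the reward
# received for the action taken in that state.
sample = first(dataset)
@show sample.instance.state sample.instance.reward
```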

whale_shark_128786.mp4: −1.44 MB (binary file removed; not shown)
