Commit 5a2e852

Implement generate_sample interface
1 parent 2f406e0 commit 5a2e852

13 files changed: 144 additions & 128 deletions

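This commit replaces the per-benchmark `generate_dataset` methods with a per-sample interface: each benchmark now implements `Utils.generate_sample(bench, rng::AbstractRNG; kwargs...)`, which returns a single `DataSample`. Presumably a generic `generate_dataset` can then be written once on top of it; the sketch below is hypothetical (it is not part of this commit, and it assumes `generate_sample` ends up exported alongside `generate_dataset`):

    # Hypothetical generic fallback built on the new per-sample interface.
    using DecisionFocusedLearningBenchmarks
    using Random: AbstractRNG, MersenneTwister

    function my_generate_dataset(bench, dataset_size::Int; seed::Int=0, kwargs...)
        rng = MersenneTwister(seed)
        # One RNG for all samples keeps the whole dataset reproducible from a single seed.
        return [generate_sample(bench, rng; kwargs...) for _ in 1:dataset_size]
    end
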
Project.toml

Lines changed: 4 additions & 0 deletions
@@ -13,6 +13,8 @@ Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
 Images = "916415d5-f1e6-5110-898d-aaa5f9f070e0"
 Ipopt = "b6b21f68-93f8-5de0-b562-5493be1d77c9"
+IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
+JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Metalhead = "dbeba491-748d-5e0e-a39e-b530a07fa0cc"
@@ -37,6 +39,8 @@ Graphs = "1.11"
 HiGHS = "1.9"
 Images = "0.26.1"
 Ipopt = "1.6"
+IterTools = "1.10.0"
+JSON = "0.21.4"
 JuMP = "1.22"
 LinearAlgebra = "1"
 Metalhead = "0.9.4"

src/Argmax/Argmax.jl

Lines changed: 28 additions & 14 deletions
@@ -62,25 +62,39 @@ end

 """
 $TYPEDSIGNATURES
-
-Generate a dataset of labeled instances for the argmax problem.
 """
-function Utils.generate_dataset(
-    bench::ArgmaxBenchmark, dataset_size::Int=10; seed::Int=0, noise_std=0.0
+function Utils.generate_sample(
+    bench::ArgmaxBenchmark, rng::AbstractRNG; noise_std::Float32=0.0f0
 )
     (; instance_dim, nb_features, encoder) = bench
-    rng = MersenneTwister(seed)
-    features = [randn(rng, Float32, nb_features, instance_dim) for _ in 1:dataset_size]
-    costs = encoder.(features)
-    noisy_solutions = [
-        one_hot_argmax(θ + noise_std * randn(rng, Float32, instance_dim)) for θ in costs
-    ]
-    return [
-        DataSample(; x, θ_true, y_true) for
-        (x, θ_true, y_true) in zip(features, costs, noisy_solutions)
-    ]
+    features = randn(rng, Float32, nb_features, instance_dim)
+    costs = encoder(features)
+    noisy_solution = one_hot_argmax(costs + noise_std * randn(rng, Float32, instance_dim))
+    return DataSample(; x=features, θ_true=costs, y_true=noisy_solution)
 end

+# """
+# $TYPEDSIGNATURES
+
+# Generate a dataset of labeled instances for the argmax problem.
+# """
+# function Utils.generate_dataset(
+#     bench::ArgmaxBenchmark, dataset_size::Int; noise_std=0.0, kwargs...
+# )
+#     return Utils.generate_dataset(bench, dataset_size; noise_std=noise_std, kwargs...)
+#     # (; instance_dim, nb_features, encoder) = bench
+#     # rng = MersenneTwister(seed)
+#     # features = [randn(rng, Float32, nb_features, instance_dim) for _ in 1:dataset_size]
+#     # costs = encoder.(features)
+#     # noisy_solutions = [
+#     #     one_hot_argmax(θ + noise_std * randn(rng, Float32, instance_dim)) for θ in costs
+#     # ]
+#     # return [
+#     #     DataSample(; x, θ_true, y_true) for
+#     #     (x, θ_true, y_true) in zip(features, costs, noisy_solutions)
+#     # ]
+# end
+
 """
 $TYPEDSIGNATURES

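A usage sketch for the new method; the `ArgmaxBenchmark()` default constructor, the `DataSample` field access, and an exported `generate_sample` are assumptions inferred from this diff rather than shown in it:

    using DecisionFocusedLearningBenchmarks
    using Random: MersenneTwister

    bench = ArgmaxBenchmark()          # assumed default constructor
    rng = MersenneTwister(0)
    sample = generate_sample(bench, rng; noise_std=0.1f0)
    sample.x        # (nb_features, instance_dim) feature matrix
    sample.θ_true   # encoder output used as true costs
    sample.y_true   # one-hot argmax of the noise-perturbed costs
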
src/DecisionFocusedLearningBenchmarks.jl

Lines changed: 3 additions & 3 deletions
@@ -54,7 +54,7 @@ include("Warcraft/Warcraft.jl")
 include("FixedSizeShortestPath/FixedSizeShortestPath.jl")
 include("PortfolioOptimization/PortfolioOptimization.jl")
 include("StochasticVehicleScheduling/StochasticVehicleScheduling.jl")
-include("DynamicVehicleScheduling/DynamicVehicleScheduling.jl")
+# include("DynamicVehicleScheduling/DynamicVehicleScheduling.jl")

 using .Utils
 using .Argmax
@@ -64,10 +64,10 @@ using .Warcraft
 using .FixedSizeShortestPath
 using .PortfolioOptimization
 using .StochasticVehicleScheduling
-using .DynamicVehicleScheduling
+# using .DynamicVehicleScheduling

 # Interface
-export AbstractBenchmark, DataSample
+export AbstractBenchmark, AbstractStochasticBenchmark, AbstractDynamicBenchmark, DataSample
 export generate_dataset
 export generate_statistical_model
 export generate_maximizer, maximizer_kwargs

src/DynamicVehicleScheduling/DynamicVSP/algorithms/prize_collecting_vsp.jl

Lines changed: 8 additions & 8 deletions
@@ -205,11 +205,11 @@ function _objective_value(θ, routes; instance)
     return -total, g
 end

-function ChainRulesCore.rrule(::typeof(my_objective_value), θ, routes; instance)
-    total, g = _objective_value(θ, routes; instance)
-    function pullback(dy)
-        g = g .* dy
-        return NoTangent(), g, NoTangent()
-    end
-    return total, pullback
-end
+# function ChainRulesCore.rrule(::typeof(my_objective_value), θ, routes; instance)
+#     total, g = _objective_value(θ, routes; instance)
+#     function pullback(dy)
+#         g = g .* dy
+#         return NoTangent(), g, NoTangent()
+#     end
+#     return total, pullback
+# end

src/DynamicVehicleScheduling/DynamicVehicleScheduling.jl

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ using Base: @kwdef
 using DocStringExtensions: TYPEDEF, TYPEDFIELDS, TYPEDSIGNATURES
 using Graphs
 using HiGHS
-using InferOpt
+# using InferOpt
 using IterTools: partition
 using JSON
 using JuMP

src/FixedSizeShortestPath/FixedSizeShortestPath.jl

Lines changed: 10 additions & 31 deletions
@@ -103,45 +103,24 @@ function Utils.generate_maximizer(bench::FixedSizeShortestPathBenchmark; use_dij
     return shortest_path_maximizer
 end

-"""
-$TYPEDSIGNATURES
-
-Generate dataset for the shortest path problem.
-"""
-function Utils.generate_dataset(
-    bench::FixedSizeShortestPathBenchmark,
-    dataset_size::Int=10;
-    seed::Int=0,
-    type::Type=Float32,
+function Utils.generate_sample(
+    bench::FixedSizeShortestPathBenchmark, rng::AbstractRNG; type::Type=Float32
 )
-    # Set seed
-    rng = MersenneTwister(seed)
     (; graph, p, deg, ν) = bench
-
+    features = randn(rng, Float32, bench.p)
     E = Graphs.ne(graph)
-
-    # Features
-    features = [randn(rng, type, p) for _ in 1:dataset_size]
-
     # True weights
     B = rand(rng, Bernoulli(0.5), E, p)
     ξ = if ν == 0.0
-        [ones(type, E) for _ in 1:dataset_size]
+        ones(type, E)
     else
-        [rand(rng, Uniform{type}(1 - ν, 1 + ν), E) for _ in 1:dataset_size]
+        rand(rng, Uniform{type}(1 - ν, 1 + ν), E)
     end
-    costs = [
-        -(1 .+ (3 .+ B * zᵢ ./ type(sqrt(p))) .^ deg) .* ξᵢ for (ξᵢ, zᵢ) in zip(ξ, features)
-    ]
-
-    shortest_path_maximizer = Utils.generate_maximizer(bench)
-
-    # Label solutions
-    solutions = shortest_path_maximizer.(costs)
-    return [
-        DataSample(; x, θ_true, y_true) for
-        (x, θ_true, y_true) in zip(features, costs, solutions)
-    ]
+    costs = -(1 .+ (3 .+ B * features ./ type(sqrt(p))) .^ deg) .* ξ
+
+    maximizer = Utils.generate_maximizer(bench)
+    solution = maximizer(costs)
+    return DataSample(; x=features, θ_true=costs, y_true=solution)
 end

 """

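Each call now produces one feature vector of length `p`, one cost vector with an entry per edge, and the corresponding shortest-path label. Usage sketch under the same assumptions as above (default constructor, exported `generate_sample`):

    using DecisionFocusedLearningBenchmarks
    using Random: MersenneTwister

    bench = FixedSizeShortestPathBenchmark()   # assumed default graph size
    rng = MersenneTwister(0)
    sample = generate_sample(bench, rng)
    length(sample.θ_true)   # one (negative) cost per edge of bench.graph
    sample.y_true           # shortest-path label returned by the benchmark maximizer
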
src/PortfolioOptimization/PortfolioOptimization.jl

Lines changed: 16 additions & 1 deletion
@@ -7,7 +7,7 @@ using Flux: Chain, Dense
 using Ipopt: Ipopt
 using JuMP: @variable, @objective, @constraint, optimize!, value, Model, set_silent
 using LinearAlgebra: I
-using Random: MersenneTwister
+using Random: AbstractRNG, MersenneTwister

 """
 $TYPEDEF
@@ -82,6 +82,21 @@ function Utils.generate_maximizer(bench::PortfolioOptimizationBenchmark)
     return portfolio_maximizer
 end

+function Utils.generate_sample(
+    bench::PortfolioOptimizationBenchmark, rng::AbstractRNG; type::Type=Float32
+)
+    (; d, p, deg, ν, L, f) = bench
+    features = randn(rng, type, p, d)
+    B = rand(rng, Bernoulli(0.5), d, p)
+    c̄ = (0.05 / type(sqrt(p)) .* B * features .+ 0.1^(1 / deg)) .^ deg
+    costs = c̄ .+ L * f .+ 0.01 * ν * randn(rng, type, d)
+
+    maximizer = Utils.generate_maximizer(bench)
+    solution = maximizer(costs)
+
+    return DataSample(; x=features, θ_true=c̄, y_true=solution)
+end
+
 """
 $TYPEDSIGNATURES

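Note that `θ_true` stores the expected costs `c̄`, while the label `y_true` is the portfolio chosen by the maximizer for the noisy realized `costs`. Usage sketch (default constructor and exported `generate_sample` assumed):

    using DecisionFocusedLearningBenchmarks
    using Random: MersenneTwister

    bench = PortfolioOptimizationBenchmark()   # assumed default constructor
    rng = MersenneTwister(0)
    sample = generate_sample(bench, rng)
    sample.θ_true   # expected costs c̄
    sample.y_true   # portfolio that is optimal for the noisy realized costs
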
src/Ranking/Ranking.jl

Lines changed: 7 additions & 13 deletions
@@ -61,22 +61,16 @@ end
 """
 $TYPEDSIGNATURES

-Generate a dataset of labeled instances for the ranking problem.
+Generate a labeled sample for the ranking problem.
 """
-function Utils.generate_dataset(
-    bench::RankingBenchmark, dataset_size::Int=10; seed::Int=0, noise_std=0.0
+function Utils.generate_sample(
+    bench::RankingBenchmark, rng::AbstractRNG; noise_std::Float32=0.0f0
 )
     (; instance_dim, nb_features, encoder) = bench
-    rng = MersenneTwister(seed)
-    features = [randn(rng, Float32, nb_features, instance_dim) for _ in 1:dataset_size]
-    costs = encoder.(features)
-    noisy_solutions = [
-        ranking(θ .+ noise_std * randn(rng, Float32, instance_dim)) for θ in costs
-    ]
-    return [
-        DataSample(; x, θ_true, y_true) for
-        (x, θ_true, y_true) in zip(features, costs, noisy_solutions)
-    ]
+    features = randn(rng, Float32, nb_features, instance_dim)
+    costs = encoder(features)
+    noisy_solution = ranking(costs .+ noise_std * randn(rng, Float32, instance_dim))
+    return DataSample(; x=features, θ_true=costs, y_true=noisy_solution)
 end

 """

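The ranking sample mirrors the argmax one, with `ranking` in place of `one_hot_argmax`. Sketch of drawing several samples from one RNG (same assumptions as before):

    using DecisionFocusedLearningBenchmarks
    using Random: MersenneTwister

    bench = RankingBenchmark()   # assumed default constructor
    rng = MersenneTwister(0)
    samples = [generate_sample(bench, rng; noise_std=0.05f0) for _ in 1:100]
    # each sample.y_true is the ranking of the noise-perturbed costs
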
src/StochasticVehicleScheduling/StochasticVehicleScheduling.jl

Lines changed: 14 additions & 25 deletions
@@ -73,43 +73,32 @@ end
 """
 $TYPEDSIGNATURES

-Create a dataset of `dataset_size` instances for the given `StochasticVehicleSchedulingBenchmark`.
-If you want to not add label solutions in the dataset, set `compute_solutions=false`.
+Generate a sample for the given `StochasticVehicleSchedulingBenchmark`.
+If you do not want to add a label solution to the sample, set `compute_solutions=false`.
 By default, they will be computed using column generation.
 Note that computing solutions can be time-consuming, especially for large instances.
 You can also use instead `compact_mip` or `compact_linearized_mip` as the algorithm to compute solutions.
 If you want to provide a custom algorithm to compute solutions, you can pass it as the `algorithm` keyword argument.
 If `algorithm` takes keyword arguments, you can pass them as well directly in `kwargs...`.
-If `store_city=false`, the coordinates and unnecessary information about instances will not be stored in the dataset.
+If `store_city=false`, the coordinates and unnecessary information about instances will not be stored in the sample.
 """
-function Utils.generate_dataset(
+function Utils.generate_sample(
     benchmark::StochasticVehicleSchedulingBenchmark,
-    dataset_size::Int;
+    rng::AbstractRNG;
+    store_city=true,
     compute_solutions=true,
-    seed=nothing,
-    rng=MersenneTwister(0),
     algorithm=column_generation_algorithm,
-    store_city=true,
     kwargs...,
 )
     (; nb_tasks, nb_scenarios) = benchmark
-    Random.seed!(rng, seed)
-    instances = [
-        Instance(; nb_tasks, nb_scenarios, rng, store_city) for _ in 1:dataset_size
-    ]
-    features = get_features.(instances)
-    if compute_solutions
-        solutions = [algorithm(instance; kwargs...).value for instance in instances]
-        return [
-            DataSample(; x=feature, instance, y_true=solution) for
-            (instance, feature, solution) in zip(instances, features, solutions)
-        ]
+    instance = Instance(; nb_tasks, nb_scenarios, rng, store_city)
+    x = get_features(instance)
+    y_true = if compute_solutions
+        algorithm(instance; kwargs...).value  # TODO: modify algorithms to directly return the solution
+    else
+        nothing
     end
-    # else
-    return [
-        DataSample(; x=feature, instance) for
-        (instance, feature) in zip(instances, features)
-    ]
+    return DataSample(; x, instance, y_true)
 end

 """
@@ -126,7 +115,7 @@ end
 $TYPEDSIGNATURES
 """
 function Utils.generate_maximizer(
-    bench::StochasticVehicleSchedulingBenchmark; model_builder=highs_model
+    ::StochasticVehicleSchedulingBenchmark; model_builder=highs_model
 )
     return StochasticVechicleSchedulingMaximizer(model_builder)
 end

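Usage sketch for the keyword arguments documented above; the constructor keywords `nb_tasks` and `nb_scenarios` are inferred from the destructuring in the diff and should be treated as assumptions:

    using DecisionFocusedLearningBenchmarks
    using Random: MersenneTwister

    bench = StochasticVehicleSchedulingBenchmark(; nb_tasks=25, nb_scenarios=10)
    rng = MersenneTwister(0)

    # Cheap, unlabeled sample: skip the expensive labeling step entirely.
    unlabeled = generate_sample(bench, rng; compute_solutions=false, store_city=false)

    # Labeled sample using one of the alternative algorithms named in the docstring.
    labeled = generate_sample(bench, rng; algorithm=compact_linearized_mip)
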
src/SubsetSelection/SubsetSelection.jl

Lines changed: 17 additions & 25 deletions
@@ -17,21 +17,28 @@ without knowing their values, but only observing some features.
 # Fields
 $TYPEDFIELDS
 """
-struct SubsetSelectionBenchmark <: AbstractBenchmark
+struct SubsetSelectionBenchmark{M} <: AbstractBenchmark
     "total number of items"
     n::Int
     "number of items to select"
     k::Int
+    "hidden unknown mapping from features to costs"
+    mapping::M
 end

 function Base.show(io::IO, bench::SubsetSelectionBenchmark)
     (; n, k) = bench
     return print(io, "SubsetSelectionBenchmark(n=$n, k=$k)")
 end

-function SubsetSelectionBenchmark(; n::Int=25, k::Int=5)
+function SubsetSelectionBenchmark(; n::Int=25, k::Int=5, identity_mapping::Bool=true)
     @assert n >= k "number of items n must be greater than k"
-    return SubsetSelectionBenchmark(n, k)
+    mapping = if identity_mapping
+        copy
+    else
+        Dense(n => n; bias=false)
+    end
+    return SubsetSelectionBenchmark(n, k, mapping)
 end

 function top_k(v::AbstractVector, k::Int)
@@ -54,29 +61,14 @@ end
 """
 $TYPEDSIGNATURES

-Generate a dataset of labeled instances for the subset selection problem.
-The mapping between features and cost is identity.
+Generate a labeled instance for the subset selection problem.
 """
-function Utils.generate_dataset(
-    bench::SubsetSelectionBenchmark,
-    dataset_size::Int=10;
-    seed::Int=0,
-    identity_mapping=true,
-)
-    (; n, k) = bench
-    rng = MersenneTwister(seed)
-    features = [randn(rng, Float32, n) for _ in 1:dataset_size]
-    costs = if identity_mapping
-        copy(features) # we assume that the cost is the same as the feature
-    else
-        mapping = Dense(n => n; bias=false)
-        mapping.(features)
-    end
-    solutions = top_k.(costs, k)
-    return [
-        DataSample(; x, θ_true, y_true) for
-        (x, θ_true, y_true) in zip(features, costs, solutions)
-    ]
+function Utils.generate_sample(bench::SubsetSelectionBenchmark, rng::AbstractRNG)
+    (; n, k, mapping) = bench
+    features = randn(rng, Float32, n)
+    costs = mapping(features)
+    solution = top_k(costs, k)
+    return DataSample(; x=features, θ_true=costs, y_true=solution)
 end

 """

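With the new `mapping` field, the default benchmark stores `copy` (so costs equal features), while `identity_mapping=false` stores a hidden random `Dense(n => n; bias=false)` layer. A quick illustrative check (field access on `DataSample` and an exported `generate_sample` are assumptions):

    using DecisionFocusedLearningBenchmarks
    using Random: MersenneTwister

    rng = MersenneTwister(0)
    bench_id  = SubsetSelectionBenchmark(; n=25, k=5)                           # identity mapping
    bench_lin = SubsetSelectionBenchmark(; n=25, k=5, identity_mapping=false)   # hidden Dense layer

    sample = generate_sample(bench_id, rng)
    sample.θ_true == sample.x   # true: with the identity mapping, costs coincide with features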