11export BoltzmannPolicy, BoltzmannMixturePolicy
22
33"""
4- BoltzmannPolicy(policy, temperature, [rng::AbstractRNG])
4+ BoltzmannPolicy(policy, temperature, [clip_threshold, rng::AbstractRNG])
55
66Policy that samples actions according to a Boltzmann distribution with the
77specified `temperature`. The unnormalized log probability of taking an action
@@ -16,6 +16,11 @@ Higher temperatures lead to an increasingly random policy, whereas a temperature
1616of zero corresponds to a deterministic policy. Q-values are computed according
1717to the underlying `policy` provided as an argument to the constructor.
1818
19+ A `clip_threshold` can be specified as a negative value, so that
20+ ``[Q(s, a) - Q_max] / T`` is clipped to a minimum of `clip_threshold`. This
21+ prevents zero-probability actions, and ensures that the relative odds of the
22+ least vs. most probable action are no smaller than `exp(clip_threshold)`.
23+
1924Note that wrapping an existing policy in a `BoltzmannPolicy` does not ensure
2025consistency of the state values ``V`` and Q-values ``Q`` according to the
2126Bellman equation, since this would require repeated Bellman updates to ensure
@@ -24,22 +29,25 @@ convergence.
@auto_hash_equals struct BoltzmannPolicy{P, R <: AbstractRNG} <: PolicySolution
    policy::P # Underlying policy that supplies the Q-values
    temperature::Float64 # Boltzmann temperature (zero means deterministic argmax)
    clip_threshold::Union{Float64, Nothing} # Lower clip on (Q - Q_max)/T, or `nothing` to disable
    rng::R # Random number generator used when sampling actions
end
2935
# Rewrapping an existing BoltzmannPolicy replaces its sampling parameters
# instead of nesting wrappers, so the inner Q-value policy is reused directly.
BoltzmannPolicy(policy::BoltzmannPolicy, temperature, clip_threshold, rng) =
    BoltzmannPolicy(policy.policy, temperature, clip_threshold, rng)

# Default construction: no clipping, global RNG.
BoltzmannPolicy(policy, temperature, clip_threshold = nothing) =
    BoltzmannPolicy(policy, temperature, clip_threshold, Random.GLOBAL_RNG)

# Convenience method: RNG supplied without a clip threshold.
BoltzmannPolicy(policy, temperature, rng::AbstractRNG) =
    BoltzmannPolicy(policy, temperature, nothing, rng)
3542
# Pretty-print the policy, showing only the wrapped `policy` field.
# NOTE(review): default indent assumed to be the empty string — the scraped
# source renders it ambiguously; confirm against the repository.
function Base.show(io::IO, ::MIME"text/plain", sol::BoltzmannPolicy)
    indent = get(io, :indent, "")
    show_struct(io, sol; indent = indent, show_fields = (:policy,))
end
4047
# Copy the wrapped policy; temperature, clip threshold, and RNG are shared.
Base.copy(sol::BoltzmannPolicy) =
    BoltzmannPolicy(copy(sol.policy), sol.temperature,
                    sol.clip_threshold, sol.rng)
4351
# Acting under a Boltzmann policy is always stochastic: delegate to sampling.
get_action(sol::BoltzmannPolicy, state::State) = rand_action(sol, state)
@@ -63,7 +71,7 @@ has_cached_action_values(sol::BoltzmannPolicy, state::State) =
6371function rand_action (sol:: BoltzmannPolicy , state:: State )
6472 if sol. temperature == 0
6573 # Reservoir sampling among maximal elements
66- qs = get_action_values (sol. policy , state)
74+ qs = get_action_values (sol, state)
6775 if isempty (qs) return missing end
6876 q_max = maximum (values (qs))
6977 n_max = 0
@@ -77,10 +85,18 @@ function rand_action(sol::BoltzmannPolicy, state::State)
7785 return chosen_act
7886 else
7987 # Reservoir sampling via Gumbel-max trick
88+ qs = get_action_values (sol, state)
89+ isempty (qs) && return missing
90+ if ! isnothing (sol. clip_threshold)
91+ max_score = maximum (values (qs)) / sol. temperature
92+ end
8093 chosen_act, chosen_score = missing , - Inf
81- for (act, q) in get_action_values (sol, state)
82- score = q / sol. temperature + randgumbel (sol. rng)
83- if score > chosen_score
94+ for (act, q) in qs
95+ score = q / sol. temperature
96+ if ! isnothing (sol. clip_threshold)
97+ score = max (score, max_score - sol. clip_threshold)
98+ end
99+ if score + randgumbel (sol. rng) > chosen_score
84100 chosen_act = act
85101 chosen_score = score
86102 end
@@ -100,7 +116,7 @@ function get_action_probs(sol::BoltzmannPolicy, state::State)
100116 n_max = sum (q >= q_max for q in q_values)
101117 probs = [q >= q_max ? 1.0 / n_max : 0.0 for q in q_values]
102118 else
103- probs = softmax (q_values ./ sol. temperature)
119+ probs = softmax (q_values ./ sol. temperature, sol . clip_threshold )
104120 end
105121 probs = Dict (zip (actions, probs))
106122 return probs
@@ -118,7 +134,7 @@ function get_action_prob(sol::BoltzmannPolicy, state::State, action::Term)
118134 q_act = get (action_values, action, - Inf )
119135 return q_act >= q_max ? 1.0 / n_max : 0.0
120136 else
121- probs = softmax (q_values ./ sol. temperature)
137+ probs = softmax (q_values ./ sol. temperature, sol . clip_threshold )
122138 for (a, p) in zip (actions, probs)
123139 a == action && return p
124140 end
@@ -127,42 +143,50 @@ function get_action_prob(sol::BoltzmannPolicy, state::State, action::Term)
127143end
128144
129145"""
130- BoltzmannMixturePolicy(policy, temperatures, [weights, rng::AbstractRNG])
146+ BoltzmannMixturePolicy(policy, temperatures, [weights,]
147+ [clip_threshold, rng::AbstractRNG])
131148
132149A mixture of Boltzmann policies with different `temperatures` and mixture
133150`weights`, specified as `Vector`s. If provided, `weights` must be non-negative
134151and sum to one. Otherwise a uniform mixture is assumed. Q-values are computed
135152according to the underlying `policy` provided as an argument to the constructor.
153+
154+ Similar to the [`BoltzmannPolicy`](@ref), `clip_threshold` can be used to
155+ prevent zero-probability actions.
136156"""
@auto_hash_equals struct BoltzmannMixturePolicy{P, R <: AbstractRNG} <: PolicySolution
    policy::P # Underlying policy that supplies the Q-values
    temperatures::Vector{Float64} # Temperature of each mixture component
    weights::Vector{Float64} # Non-negative component weights, summing to one
    clip_threshold::Union{Float64, Nothing} # Lower clip on (Q - Q_max)/T, or `nothing` to disable
    rng::R # Random number generator used when sampling actions
    function BoltzmannMixturePolicy{P, R}(
        policy::P, temperatures, weights, clip_threshold, rng::R
    ) where {P, R <: AbstractRNG}
        # Validate that the mixture specification is well-formed.
        @assert length(temperatures) == length(weights)
        @assert all(w >= 0 for w in weights)
        @assert isapprox(sum(weights), 1.0)
        temperatures = convert(Vector{Float64}, temperatures)
        weights = convert(Vector{Float64}, weights)
        return new(policy, temperatures, weights, clip_threshold, rng)
    end
end
153174
# Outer constructor: defaults to a uniform mixture, no clipping, global RNG.
function BoltzmannMixturePolicy(
    policy::P,
    temperatures,
    weights = ones(length(temperatures)) ./ length(temperatures),
    clip_threshold = nothing,
    rng::R = Random.GLOBAL_RNG
) where {P, R <: AbstractRNG}
    return BoltzmannMixturePolicy{P, R}(
        policy, temperatures, weights, clip_threshold, rng
    )
end
162186
# Convenience method: RNG supplied without weights or a clip threshold;
# assumes a uniform mixture over the given temperatures.
function BoltzmannMixturePolicy(policy, temperatures, rng::AbstractRNG)
    n = length(temperatures)
    uniform = ones(n) ./ n
    return BoltzmannMixturePolicy(policy, temperatures, uniform, nothing, rng)
end
167191
168192function Base. show (io:: IO , :: MIME"text/plain" , sol:: BoltzmannMixturePolicy )
173197
# Copy the wrapped policy and the mutable weight/temperature vectors.
# `clip_threshold` is passed through as-is: it is an immutable
# `Union{Float64, Nothing}`, and `copy(nothing)` would throw a `MethodError`
# since Base defines no `copy(::Nothing)` method.
Base.copy(sol::BoltzmannMixturePolicy) =
    BoltzmannMixturePolicy(copy(sol.policy), copy(sol.temperatures),
                           copy(sol.weights), sol.clip_threshold, sol.rng)
177201
# Acting under a Boltzmann mixture is always stochastic: delegate to sampling.
get_action(sol::BoltzmannMixturePolicy, state::State) = rand_action(sol, state)
@@ -196,7 +220,7 @@ has_cached_action_values(sol::BoltzmannMixturePolicy, state::State) =
196220
# Sample an action by first drawing a component temperature according to the
# mixture weights, then sampling from the corresponding Boltzmann policy.
function rand_action(sol::BoltzmannMixturePolicy, state::State)
    temp = sample(sol.rng, sol.temperatures, Weights(sol.weights))
    component = BoltzmannPolicy(sol.policy, temp, sol.clip_threshold, sol.rng)
    return rand_action(component, state)
end
202226
@@ -216,7 +240,7 @@ function get_action_probs(sol::BoltzmannMixturePolicy, state::State)
216240 probs[i] += weight / n_max
217241 end
218242 else
219- probs .+ = softmax (q_values ./ temp) .* weight
243+ probs .+ = softmax (q_values ./ temp, sol . clip_threshold ) .* weight
220244 end
221245 end
222246 probs = Dict (zip (actions, probs))
@@ -239,7 +263,7 @@ function get_action_prob(sol::BoltzmannMixturePolicy,
239263 q_act = q_values[act_idx]
240264 act_prob += q_act >= q_max ? weight / n_max : 0.0
241265 else
242- probs = softmax (q_values ./ temp)
266+ probs = softmax (q_values ./ temp, sol . clip_threshold )
243267 act_prob += probs[act_idx] * weight
244268 end
245269 end
@@ -267,7 +291,7 @@ function get_mixture_weights(sol::BoltzmannMixturePolicy,
267291 q_act = q_values[act_idx]
268292 return q_act >= q_max ? weight / n_max : 0.0
269293 else
270- probs = softmax (q_values ./ temp)
294+ probs = softmax (q_values ./ temp, sol . clip_threshold )
271295 return probs[act_idx] * weight
272296 end
273297 end
0 commit comments