Commit 0d30ee1

edit pomdp models
1 parent 276ce43 commit 0d30ee1

File tree: 9 files changed (+206, −57 lines)


docs/src/mdp.md

Lines changed: 3 additions & 52 deletions
@@ -1,6 +1,6 @@
-# Markov Decision Process
+# MDP Usage
 
-## MDP Struct
+## MDP
 The MDP struct gives the following:
 - `γ`: discount factor
 - `𝒮`: state space
@@ -9,53 +9,4 @@ The MDP struct gives the following:
 - `R`: reward function
 - `TR`: function allows us to sample transition and reward
 
-## DiscreteMDP Struct
-The DiscreteMDP struct gives the following objects and methods:
-- `ordered_states(m::DiscreteMDP)`: gives a vector of states
-- `ordered_actions(m::DiscreteMDP)`: gives a vector of actions
-- `T`: Matrix of transition function T(s,a,s′)
-- `transition(m::DiscreteMDP, s::Int, a::Int)`: function that gives a distribution of the transition
-- `generate_s(m::DiscreteMDP, s::Int, a::Int)`: function that samples the state from a transition
-- `R`: Matrix of reward values R(s,a) = ∑_s' R(s,a,s')*T(s,a,s′)
-- `reward(m::DiscreteMDP, s::Int, a::Int)`: function gives the reward of a state and action pair
-- `γ`: Discount factor
-
-## Cart Pole, Mountain Car, Simple LQR
-These problems all have similar usage documentation. To build an instance of one of these problems run
-```julia
-m = Problem()
-mdp = MDP(m)
-```
-where `Problem` is either replaced with `CartPole`, `MountainCar`, or `LqrMDP`. Then `mdp` is a MDP struct so we get access to all of the functions describe in the MDP Struct section.
-
-## Hex World
-For Hex World, you use the DiscreteMDP struct. You can either set up the HexWorld manually by calling
-```julia
-m = HexWorldMDP(hexes, HexWorldRBumpBorder, HexWorldPIntended, special_hex_rewards, HexWorldDiscountFactor)
-```
-where `HexWorldRBumpBorder`, `HexWorldPIntended` and `HexWorldDiscountFactor` are constants, `hexes` is a list of 2 dimensional coordinates and `special_hex_rewards` is a dictionary of all nonzero rewards.
-It is also possible to use one of the preset MDPs:
-```julia
-m = HexWorld
-m = StraightLineHexWorld
-```
-Then running
-```julia
-mdp = m.mdp
-```
-gives an instance of a DiscreteMDP struct.
-
-## Collision Avoidance
-To create an instance of the problem, run
-```julia
-m = CollisionAvoidanceMDP()
-```
-Each of the state are instances of the struct `CollissionAvoidanceMDPState` that have the objects
-- `h`: vertical separation
-- `dh`: rate of change in h
-- `a_prev`: last action
-- `τ`: horizontal time separation
-Then you the CollisionAvoidanceMDP struct has the methods:
-- `transition(𝒫::CollisionAvoidanceMDP, s::CollisionAvoidanceMDPState, a::Float64)`: returns a distribution of states which can be sampled
-- `is_terminal(𝒫::CollisionAvoidanceMDP, s::CollisionAvoidanceMDPState)`: determines if the state is terminal
-- `reward(𝒫::CollisionAvoidanceMDP, s::CollisionAvoidanceMDPState, a::Float64)`: gives the reward
+The function `T` takes in a state `s` and an action `a` and returns a distribution of states which can be sampled. The reward function `R` takes in a state `s` and an action `a` and returns a reward. Finally, `TR` takes in a state `s` and an action `a` and returns a tuple `(s', r)`, where `s'` is the new state sampled from the transition function and `r` is the reward.
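For orientation, here is a minimal usage sketch of the interface described in the new paragraph above. It is a sketch only, assuming the package is loaded and that a problem with an `MDP` constructor (the `HexWorld` preset is used here as an assumed example) is in scope.

```julia
# Sketch only: `HexWorld` and `MDP` are assumed to be in scope once the package is loaded.
mdp = MDP(HexWorld)      # wrap a preset problem as a generic MDP

s = rand(mdp.𝒮)          # pick a state from the state space
a = rand(mdp.𝒜)          # pick an action from the action space

S′ = mdp.T(s, a)         # distribution over successor states
s′ = rand(S′)            # sample a successor
r  = mdp.R(s, a)         # immediate reward
s′, r = mdp.TR(s, a)     # or sample transition and reward together
```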

docs/src/pomdp.md

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+# POMDP Usage
+
+## POMDP
+The POMDP struct gives the following:
+- `γ`: discount factor
+- `𝒮`: state space
+- `𝒜`: action space
+- `𝒪`: observation space
+- `T`: transition function
+- `R`: reward function
+- `O`: observation function
+- `TRO`: function that allows us to sample transition, reward, and observation
+
+The function `T` takes in a state `s` and an action `a` and returns a distribution of possible states. The reward function `R` takes in a state `s` and an action `a` and returns a reward. The observation function `O` takes in a state `s` and an action `a` and returns a distribution of possible observations. Finally, `TRO` takes in a state `s` and an action `a` and returns a tuple `(s', r, o)`, where `s'` is the new state sampled from the transition function, `r` is the reward, and `o` is an observation sampled from the observation function.
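Analogously, a minimal usage sketch for the POMDP interface. This is a sketch under assumptions: the `Catch` problem is used as an example, and any problem with a `POMDP` constructor should work the same way.

```julia
# Sketch only: `Catch` and `POMDP` are assumed to be in scope once the package is loaded.
pomdp = POMDP(Catch())

s = rand(pomdp.𝒮)             # a state from the state space
a = rand(pomdp.𝒜)             # an action from the action space

s′ = rand(pomdp.T(s, a))      # sample a successor state
r  = pomdp.R(s, a)            # immediate reward
s′, r, o = pomdp.TRO(s, a)    # sample transition, reward, and observation together
```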

src/mdp/discrete_mdp.jl

Lines changed: 22 additions & 1 deletion
@@ -24,4 +24,25 @@ function generate_s(mdp::DiscreteMDP, s::Int, a::Int)
     end
     return s′
 end
-reward(mdp::DiscreteMDP, s::Int, a::Int) = mdp.R[s,a]
+reward(mdp::DiscreteMDP, s::Int, a::Int) = mdp.R[s,a]
+
+function MDP(mdp::DiscreteMDP; γ::Float64=discount(mdp))
+    return MDP(
+        γ,
+        ordered_states(mdp),
+        ordered_actions(mdp),
+        (s,a, s′=nothing) -> begin
+            S′ = transition(mdp, s, a)
+            if s′ == nothing
+                return S′
+            end
+            return pdf(S′, s′)
+        end,
+        (s,a) -> reward(mdp, s, a),
+        (s, a)->begin
+            s′ = rand(transition(mdp,s,a))
+            r = reward(mdp, s, a)
+            return (s′, r)
+        end
+    )
+end
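The transition closure above serves both as a generative model and as an explicit probability query, so one `MDP` field supports sampling-based and exact algorithms alike. A usage sketch, assuming a `DiscreteMDP` such as the one behind the `HexWorld` preset is available:

```julia
# Sketch only: names are assumed to be in scope once the package is loaded.
dmdp = DiscreteMDP(HexWorld)   # tabular representation of a preset problem
mdp  = MDP(dmdp)               # wrap it with the new constructor

s, a = 1, 1
S′ = mdp.T(s, a)       # s′ omitted: full distribution over successor states
p  = mdp.T(s, a, 1)    # s′ given: the probability pdf(S′, s′)
s′, r = mdp.TR(s, a)   # sampled successor plus its reward
```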

src/mdp/hexworld.jl

Lines changed: 3 additions & 0 deletions
@@ -174,3 +174,6 @@ end
 function DiscreteMDP(mdp::HexWorldMDP)
     return mdp.mdp
 end
+function MDP(mdp::HexWorldMDP)
+    return MDP(mdp.mdp)
+end
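With this wrapper, the hex-world presets can be converted in one call instead of reaching into the `mdp` field. A small sketch, assuming the preset constants are exported:

```julia
# Sketch only: the preset constants are assumed to be in scope.
mdp = MDP(HexWorld)                 # equivalent to MDP(HexWorld.mdp)
mdp2 = MDP(StraightLineHexWorld)
```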

src/pomdp/catch.jl

Lines changed: 42 additions & 0 deletions
@@ -44,4 +44,46 @@ function DiscretePOMDP(mdp::Catch; γ::Float64=mdp.γ)
     return DiscretePOMDP(T, R, O, γ)
 end
 
+function POMDP(mdp::Catch; γ::Float64=mdp.γ)
+    Θ = [20,40,60,80] # proficiencies
+    𝒜 = collect(10:10:100) # throw distances
+
+    nS = length(Θ)
+    nA = length(𝒜)
+    nO = 2 # catch or not
+
+    T = zeros(nS, nA, nS)
+    R = Array{Float64}(undef, nS, nA)
+    O = Array{Float64}(undef, nA, nS, nO)
+
+    o_catch = 1
+    o_drop = 2
+
+    prob_catch(d,θ) = 1 - 1/(1+exp(-(d-θ)/15))
+
+    # Transition dynamics are 100% stationary.
+    for si in 1:nS
+        for ai in 1:nA
+            T[si, ai, si] = 1.0
+        end
+    end
+
+    # Reward equal to distance caught
+    for (si,θ) in enumerate(Θ)
+        for (ai,d) in enumerate(𝒜)
+            R[si,ai] = d*prob_catch(d,θ) # distance caught times prob of catch
+        end
+    end
+
+    # Observation is based on whether we caught or not.
+    for (ai,d) in enumerate(𝒜)
+        for (si′,θ) in enumerate(Θ)
+            O[ai,si′,o_catch] = prob_catch(d,θ)
+            O[ai,si′,o_drop] = 1 - O[ai,si′,o_catch]
+        end
+    end
+
+    return POMDP(DiscretePOMDP(T, R, O, γ))
+end
+
 # const Catch = generate_catch_pomdp(0.9)
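The reward table encodes a tradeoff: longer throws pay more but are caught less often, with the catch probability following the logistic curve defined above. The following standalone sketch reproduces that arithmetic for illustration; it copies `prob_catch` rather than calling the package.

```julia
# Reproduces the catch-probability model from the function above, for illustration only.
prob_catch(d, θ) = 1 - 1 / (1 + exp(-(d - θ) / 15))

distances = 10:10:100
θ = 60                                                 # one of the proficiency levels
expected = [d * prob_catch(d, θ) for d in distances]   # matches R[si, ai] for this θ
best = distances[argmax(expected)]                     # throw distance with highest expected reward
```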

src/pomdp/crying_baby.jl

Lines changed: 43 additions & 0 deletions
@@ -137,3 +137,46 @@ function DiscretePOMDP(pomdp::BabyPOMDP; γ::Float64=pomdp.γ)
 
     return DiscretePOMDP(T, R, O, γ)
 end
+
+function POMDP(pomdp::BabyPOMDP; γ::Float64=pomdp.γ)
+    nS = n_states(pomdp)
+    nA = n_actions(pomdp)
+    nO = n_observations(pomdp)
+
+    T = zeros(nS, nA, nS)
+    R = Array{Float64}(undef, nS, nA)
+    O = Array{Float64}(undef, nA, nS, nO)
+
+    s_s = 1
+    s_h = 2
+
+    a_f = 1
+    a_i = 2
+    a_s = 3
+
+    o_c = 1
+    o_q = 2
+
+    T[s_s, a_f, :] = [1.0, 0.0]
+    T[s_s, a_i, :] = [1.0-pomdp.p_become_hungry, pomdp.p_become_hungry]
+    T[s_s, a_s, :] = [1.0-pomdp.p_become_hungry, pomdp.p_become_hungry]
+    T[s_h, a_f, :] = [1.0, 0.0]
+    T[s_h, a_i, :] = [0.0, 1.0]
+    T[s_h, a_s, :] = [0.0, 1.0]
+
+    R[s_s, a_f, :] = reward(pomdp, s_s, a_f)
+    R[s_s, a_i, :] = reward(pomdp, s_s, a_i)
+    R[s_s, a_s, :] = reward(pomdp, s_s, a_s)
+    R[s_h, a_f, :] = reward(pomdp, s_h, a_f)
+    R[s_h, a_i, :] = reward(pomdp, s_h, a_i)
+    R[s_h, a_s, :] = reward(pomdp, s_h, a_s)
+
+    O[a_f, s_s, :] = [observation(pomdp, a_f, s_s).p, 1 - observation(pomdp, a_f, s_s).p]
+    O[a_f, s_h, :] = [observation(pomdp, a_f, s_h).p, 1 - observation(pomdp, a_f, s_h).p]
+    O[a_i, s_s, :] = [observation(pomdp, a_i, s_s).p, 1 - observation(pomdp, a_i, s_s).p]
+    O[a_i, s_h, :] = [observation(pomdp, a_i, s_h).p, 1 - observation(pomdp, a_i, s_h).p]
+    O[a_s, s_s, :] = [observation(pomdp, a_s, s_s).p, 1 - observation(pomdp, a_s, s_s).p]
+    O[a_s, s_h, :] = [observation(pomdp, a_s, s_h).p, 1 - observation(pomdp, a_s, s_h).p]
+
+    return POMDP(DiscretePOMDP(T, R, O, γ))
+end
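Because the transition table is written out by hand, a quick standalone check that each `(s, a)` row forms a probability distribution can be useful. The sketch below mirrors the table with an assumed value for `p_become_hungry` and does not call the package.

```julia
# Mirrors the hand-written table above; the value of p is an assumption for illustration.
p = 0.1                      # assumed pomdp.p_become_hungry
T = zeros(2, 3, 2)           # T[s, a, s′]; states: 1 = sated, 2 = hungry; actions: feed, ignore, sing
T[1, 1, :] = [1.0, 0.0]      # feeding always leaves the baby sated
T[1, 2, :] = [1.0 - p, p]    # ignoring or singing may let the baby become hungry
T[1, 3, :] = [1.0 - p, p]
T[2, 1, :] = [1.0, 0.0]
T[2, 2, :] = [0.0, 1.0]      # a hungry baby stays hungry unless fed
T[2, 3, :] = [0.0, 1.0]
@assert all(sum(T, dims=3) .≈ 1.0)   # every (s, a) row sums to one
```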

src/pomdp/discrete_pomdp.jl

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ reward(pomdp::DiscretePOMDP, s::Int, a::Int) = pomdp.R[s,a]
 
 reward(pomdp::DiscretePOMDP, b::Vector{Float64}, a::Int) = sum(reward(pomdp,s,a)*b[s] for s in ordered_states(pomdp))
 
-function POMDP(pomdp; γ::Float64=discount(pomdp))
+function POMDP(pomdp::DiscretePOMDP; γ::Float64=discount(pomdp))
     return POMDP(
         γ,
         ordered_states(pomdp),

src/pomdp/machine_replacement.jl

Lines changed: 72 additions & 0 deletions
@@ -90,6 +90,78 @@ function DiscretePOMDP(pomdp::MachineReplacement; γ::Float64=pomdp.γ)
     return DiscretePOMDP(T, R, O, γ)
 end
 
+function POMDP(pomdp::MachineReplacement; γ::Float64=pomdp.γ)
+    T = Array{Float64}(undef, 3, 4, 3)
+    R = Array{Float64}(undef, 3, 4)
+    O = Array{Float64}(undef, 4, 3, 2)
+
+    s_0 = 1 # none broken
+    s_1 = 2 # one broken
+    s_2 = 3 # two broken
+
+    a_m = 1 # manufacture
+    a_e = 2 # manufacture + examine
+    a_i = 3 # interrupt the line, inspect components, replace failed components
+    a_r = 4 # interrupt the line, replace both components
+
+    o_n = 1 # nondefective
+    o_d = 2 # defective
+
+    T[s_0, a_m, :] = [0.81, 0.18, 0.01] # 10% independent chance of part breaking
+    T[s_0, a_e, :] = [0.81, 0.18, 0.01]
+    T[s_0, a_i, :] = [1.00, 0.00, 0.00]
+    T[s_0, a_r, :] = [1.00, 0.00, 0.00]
+    T[s_1, a_m, :] = [0.00, 0.90, 0.10] # 10% chance of remaining part breaking
+    T[s_1, a_e, :] = [0.00, 0.90, 0.10]
+    T[s_1, a_i, :] = [1.00, 0.00, 0.00]
+    T[s_1, a_r, :] = [1.00, 0.00, 0.00]
+    T[s_2, a_m, :] = [0.00, 0.00, 1.00] # stay broken
+    T[s_2, a_e, :] = [0.00, 0.00, 1.00]
+    T[s_2, a_i, :] = [1.00, 0.00, 0.00]
+    T[s_2, a_r, :] = [1.00, 0.00, 0.00]
+
+    # There is a profit of 1 for producing a nondefective product.
+    # Thus, the expected profit for beginning with 0, 1, or 2 defective parts is
+    # 0.9025, 0.475, and 0.25, respectively.
+    # Examining the finished product costs 0.25.
+    # The inspect action incurs a 0.5 penalty plus replacement cost for each unit of 1.
+    # The straight-up replacement action has no inspection cost but does incur a 2 unit cost.
+    r_examine = -0.25
+    r_inspect = -0.5
+    r_replace = -2.0
+
+    R[s_0, a_m] = 0.9025
+    R[s_1, a_m] = 0.475
+    R[s_2, a_m] = 0.25
+    R[s_0, a_e] = 0.9025 + r_examine
+    R[s_1, a_e] = 0.475 + r_examine
+    R[s_2, a_e] = 0.25 + r_examine
+    R[s_0, a_i] = r_inspect
+    R[s_1, a_i] = r_inspect - 1.0 # replace 1
+    R[s_2, a_i] = r_inspect - 2.0 # replace 2
+    R[s_0, a_r] = r_replace
+    R[s_1, a_r] = r_replace
+    R[s_2, a_r] = r_replace
+
+    # Probabilities of observing a nondefective product are 1.0, 0.5, and 0.25 if
+    # there are 0, 1, or 2 faulty internal components.
+    # If we don't examine, we always observe nondefective.
+    O[a_m, s_0, :] = [1.00, 0.00]
+    O[a_m, s_1, :] = [1.00, 0.00]
+    O[a_m, s_2, :] = [1.00, 0.00]
+    O[a_e, s_0, :] = [1.00, 0.00]
+    O[a_e, s_1, :] = [0.50, 0.50]
+    O[a_e, s_2, :] = [0.25, 0.75]
+    O[a_i, s_0, :] = [1.00, 0.00]
+    O[a_i, s_1, :] = [1.00, 0.00]
+    O[a_i, s_2, :] = [1.00, 0.00]
+    O[a_r, s_0, :] = [1.00, 0.00]
+    O[a_r, s_1, :] = [1.00, 0.00]
+    O[a_r, s_2, :] = [1.00, 0.00]
+
+    return POMDP(DiscretePOMDP(T, R, O, γ))
+end
+
 # MachineReplacement = generate_machine_replacement_pomdp(1.0)
 
 MACHINE_REPLACEMENT_ACTION_COLORS = Dict(
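A short usage sketch of the new constructor, loosely mirroring the updated tests (constructor and field names are assumed to be in scope once the package is loaded):

```julia
# Sketch only: assumes the package exports these names.
pomdp = POMDP(MachineReplacement())

s = 1                        # both components working
a = 2                        # manufacture and examine the product
s′, r, o = pomdp.TRO(s, a)   # per the tables above, r = 0.9025 - 0.25; o is 1 (nondefective) or 2 (defective)
```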

test/runtests.jl

Lines changed: 6 additions & 3 deletions
@@ -55,6 +55,7 @@ end
     @test p.generate_sr(m, state, action)[1] in p.ordered_states(m) && p.generate_sr(m, state, action)[2] <= 10
     @test p.generate_start_state(m) in p.ordered_states(m)
     @test p.hex_distance(rand(hexes), rand(hexes)) >= 0
+    mdp = p.DiscreteMDP(m)
 end
 @testset "simple_lqr.jl" begin
     m = p.LqrMDP()
@@ -80,7 +81,8 @@ end
 
 
 @testset "crying_baby.jl" begin
-    m = p.BabyPOMDP(-10.0, -5.0, -0.5, 0.1, 0.8, 0.1, 0.9, 0.9)
+    # m = p.BabyPOMDP(-10.0, -5.0, -0.5, 0.1, 0.8, 0.1, 0.9, 0.9)
+    m = p.BabyPOMDP()
     @test p.n_states(m) == 2 && p.ordered_states(m) == [1, 2]
     @test p.n_actions(m) == 3 && p.ordered_actions(m) == [1, 2, 3]
     @test p.n_observations(m) == 2 && p.ordered_observations(m) == [true, false]
@@ -89,12 +91,13 @@ end
     @test 0 <= p.observation(m, rand(1:3), rand(1:2)).p <= 1
     @test p.reward(m, rand(1:2), rand(1:3)) <= 0
     @test p.reward(m, [0.1, 0.9], rand(1:3)) <= 0
+    pomdp = p.POMDP(m)
 end
 
 @testset "machine_replacement.jl" begin
     # m = p.generate_machine_replacement_pomdp(1.0)
     mdp = p.MachineReplacement()
-    m = p.MachineReplacement(mdp)
+    m = p.DiscretePOMDP(mdp)
     @test p.n_states(m) == 3 && p.ordered_states(m) == 1:3
     @test p.n_actions(m) == 4 && p.ordered_actions(m) == 1:4
     @test p.n_observations(m) == 2 && p.ordered_observations(m) == 1:2
@@ -108,7 +111,7 @@ end
 @testset "catch.jl" begin
     # m = p.generate_catch_pomdp(0.9)
     mdp = p.Catch()
-    m = p.DiscreteMDP(mdp)
+    m = p.DiscretePOMDP(mdp)
     @test p.n_states(m) == 4 && p.ordered_states(m) == 1:4
     @test p.n_actions(m) == 10 && p.ordered_actions(m) == 1:10
     @test p.n_observations(m) == 2 && p.ordered_observations(m) == 1:2
