From e49c4b58eb6bbb01c4f87fab8cb11ca86d4ca213 Mon Sep 17 00:00:00 2001 From: Wyatt Lansford Date: Mon, 28 Sep 2020 13:41:39 -0400 Subject: [PATCH 1/4] potential based rewards base class --- adept/rewards/__init__.py | 0 adept/rewards/base/__init__.py | 0 .../base/base_potential_based_rewards.py | 83 +++++++++++++++++++ 3 files changed, 83 insertions(+) create mode 100644 adept/rewards/__init__.py create mode 100644 adept/rewards/base/__init__.py create mode 100644 adept/rewards/base/base_potential_based_rewards.py diff --git a/adept/rewards/__init__.py b/adept/rewards/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/adept/rewards/base/__init__.py b/adept/rewards/base/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/adept/rewards/base/base_potential_based_rewards.py b/adept/rewards/base/base_potential_based_rewards.py new file mode 100644 index 0000000..7e18adf --- /dev/null +++ b/adept/rewards/base/base_potential_based_rewards.py @@ -0,0 +1,83 @@ + +import numpy as np +import abc + +class BasePotentialBasedReward: + """ + Class for applying potential reward shaping to a set of observations. Potential based rewards + are used to prevent the learning of suboptimal policies. The reward for executing a transition + between states is the difference in value between the potential function applied to each state. 
+ This condition is sufficient to guarentee policy invariance + + For details, see + + "Policy invariance under reward transformations: + Theory and application to reward shaping" + https://people.eecs.berkeley.edu/~pabbeel/cs287-fa09/readings/NgHaradaRussell-shaping-ICML1999.pdf + + + """ + + def __init__( + self, + name: str, + gamma: float, + exponent_coefficient: float, + minimum: float, + maximum: float, + absolute: bool, + reward_base: float, + ) -> None: + + """ + Parameters + ---------- + name: str + Name of the shaped reward + gamma : float + Discount factor needed for calculating potential-based reward shaping + exponent_coefficient : float + The coefficient of the exponent value. The smaller the value, the closer to linear + minimum : float + Minimum value to be given to the agent. This should match the minimum of the gym space + maximum : float + Maximum value to be given to the agent. This should match the maximum of the gym space + absolute : bool + If the absolute value should be taken during preprocessing + reward_base : float + Reward to use for the phi calculations (before the potential--not the actual reward provided) + """ + self._name = name + self._gamma = gamma + self._exponential_coefficient = exponent_coefficient + self._minimum = minimum + self._maximum = maximum + self._absolute = absolute + self._reward_base = reward_base + + self._midpoint = (self._maximum - self._minimum) / 2 + self._minimum + + + def __call__(self, observation, next_observation, action,) -> float: + + return self._potential_shaping_function(observation, next_observation) + + def name(self) -> str: + return f"{type(self).__name__}_{self._name}" + + def _preprocess_absolute(self, x): + return np.abs(x) if self._absolute else x + + def _preprocess_observation(self, x): + return min(max(self._minimum, x), self._maximum) + + def _potential_shaping_function(self, current_observation, next_observation) -> float: + return (self._gamma * self._phi(next_observation)) - 
self._phi(current_observation) + + @abc.abstractmethod + def _phi(self, x) -> float: + """ + Example phi function: + return self._reward_base / (1 + np.exp(self._exponential_coefficient * (self._preprocess_observation(x) - self._midpoint))) + """ + raise NotImplementedError \ No newline at end of file From 2681d411ffee7822442c9ce4fa83185311f80614 Mon Sep 17 00:00:00 2001 From: Wyatt Lansford Date: Mon, 28 Sep 2020 13:43:31 -0400 Subject: [PATCH 2/4] fixing spacing --- adept/rewards/base/base_potential_based_rewards.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/adept/rewards/base/base_potential_based_rewards.py b/adept/rewards/base/base_potential_based_rewards.py index 7e18adf..c0906e4 100644 --- a/adept/rewards/base/base_potential_based_rewards.py +++ b/adept/rewards/base/base_potential_based_rewards.py @@ -14,8 +14,6 @@ class BasePotentialBasedReward: "Policy invariance under reward transformations: Theory and application to reward shaping" https://people.eecs.berkeley.edu/~pabbeel/cs287-fa09/readings/NgHaradaRussell-shaping-ICML1999.pdf - - """ def __init__( @@ -57,9 +55,7 @@ def __init__( self._midpoint = (self._maximum - self._minimum) / 2 + self._minimum - def __call__(self, observation, next_observation, action,) -> float: - return self._potential_shaping_function(observation, next_observation) def name(self) -> str: @@ -72,7 +68,7 @@ def _preprocess_observation(self, x): return min(max(self._minimum, x), self._maximum) def _potential_shaping_function(self, current_observation, next_observation) -> float: - return (self._gamma * self._phi(next_observation)) - self._phi(current_observation) + return (self._gamma * self._phi(next_observation)) - self._phi(current_observation) @abc.abstractmethod def _phi(self, x) -> float: From bef315aa20f33f40c1104ca7737d0c25a7866b71 Mon Sep 17 00:00:00 2001 From: Wyatt Lansford Date: Mon, 28 Sep 2020 13:47:14 -0400 Subject: [PATCH 3/4] removing unused arg --- 
adept/rewards/base/base_potential_based_rewards.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adept/rewards/base/base_potential_based_rewards.py b/adept/rewards/base/base_potential_based_rewards.py index c0906e4..58b13ef 100644 --- a/adept/rewards/base/base_potential_based_rewards.py +++ b/adept/rewards/base/base_potential_based_rewards.py @@ -55,7 +55,7 @@ def __init__( self._midpoint = (self._maximum - self._minimum) / 2 + self._minimum - def __call__(self, observation, next_observation, action,) -> float: + def __call__(self, observation, next_observation) -> float: return self._potential_shaping_function(observation, next_observation) def name(self) -> str: From 2af1e37681eeab1e462ccae8b7dadd492cb3a4e3 Mon Sep 17 00:00:00 2001 From: Wyatt Lansford Date: Mon, 28 Sep 2020 13:52:08 -0400 Subject: [PATCH 4/4] adding clarity to comments --- adept/rewards/base/base_potential_based_rewards.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/adept/rewards/base/base_potential_based_rewards.py b/adept/rewards/base/base_potential_based_rewards.py index 58b13ef..7e26529 100644 --- a/adept/rewards/base/base_potential_based_rewards.py +++ b/adept/rewards/base/base_potential_based_rewards.py @@ -7,10 +7,12 @@ class BasePotentialBasedReward: Class for applying potential reward shaping to a set of observations. Potential based rewards are used to prevent the learning of suboptimal policies. The reward for executing a transition between states is the difference in value between the potential function applied to each state. - This condition is sufficient to guarentee policy invariance + This condition is sufficient to guarantee policy invariance. - For details, see + This implementation provides support for potential based reward shaping over scalar observations + of length 1. 
+ For details, see "Policy invariance under reward transformations: Theory and application to reward shaping" https://people.eecs.berkeley.edu/~pabbeel/cs287-fa09/readings/NgHaradaRussell-shaping-ICML1999.pdf