From e49c4b58eb6bbb01c4f87fab8cb11ca86d4ca213 Mon Sep 17 00:00:00 2001 From: Wyatt Lansford Date: Mon, 28 Sep 2020 13:41:39 -0400 Subject: [PATCH 1/4] potential based rewards base class --- adept/rewards/__init__.py | 0 adept/rewards/base/__init__.py | 0 .../base/base_potential_based_rewards.py | 83 +++++++++++++++++++ 3 files changed, 83 insertions(+) create mode 100644 adept/rewards/__init__.py create mode 100644 adept/rewards/base/__init__.py create mode 100644 adept/rewards/base/base_potential_based_rewards.py diff --git a/adept/rewards/__init__.py b/adept/rewards/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/adept/rewards/base/__init__.py b/adept/rewards/base/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/adept/rewards/base/base_potential_based_rewards.py b/adept/rewards/base/base_potential_based_rewards.py new file mode 100644 index 0000000..7e18adf --- /dev/null +++ b/adept/rewards/base/base_potential_based_rewards.py @@ -0,0 +1,83 @@ + +import numpy as np +import abc + +class BasePotentialBasedReward: + """ + Class for applying potential reward shaping to a set of observations. Potential based rewards + are used to prevent the learning of suboptimal policies. The reward for executing a transition + between states is the difference in value between the potential function applied to each state. 
+ This condition is sufficient to guarentee policy invariance + + For details, see + + "Policy invariance under reward transformations: + Theory and application to reward shaping" + https://people.eecs.berkeley.edu/~pabbeel/cs287-fa09/readings/NgHaradaRussell-shaping-ICML1999.pdf + + + """ + + def __init__( + self, + name: str, + gamma: float, + exponent_coefficient: float, + minimum: float, + maximum: float, + absolute: bool, + reward_base: float, + ) -> None: + + """ + Parameters + ---------- + name: str + Name of the shaped reward + gamma : float + Discount factor needed for calculating potential-based reward shaping + exponent_coefficient : float + The coefficient of the exponent value. The smaller the value, the closer to linear + minimum : float + Minimum value to be given to the agent. This should match the minimum of the gym space + maximum : float + Maximum value to be given to the agent. This should match the maximum of the gym space + absolute : bool + If the absolute value should be taken during preprocessing + reward_base : float + Reward to use for the phi calculations (before the potential--not the actual reward provided) + """ + self._name = name + self._gamma = gamma + self._exponential_coefficient = exponent_coefficient + self._minimum = minimum + self._maximum = maximum + self._absolute = absolute + self._reward_base = reward_base + + self._midpoint = (self._maximum - self._minimum) / 2 + self._minimum + + + def __call__(self, observation, next_observation, action,) -> float: + + return self._potential_shaping_function(observation, next_observation) + + def name(self) -> str: + return f"{type(self).__name__}_{self._name}" + + def _preprocess_absolute(self, x): + return np.abs(x) if self._absolute else x + + def _preprocess_observation(self, x): + return min(max(self._minimum, x), self._maximum) + + def _potential_shaping_function(self, current_observation, next_observation) -> float: + return (self._gamma * self._phi(next_observation)) - 
self._phi(current_observation) + + @abc.abstractmethod + def _phi(self, x) -> float: + """ + Example phi function: + return self._reward_base / (1 + np.exp(self._exponential_coefficient * (self._preprocess_observation(x) - self._midpoint))) + """ + raise NotImplementedError \ No newline at end of file From 2681d411ffee7822442c9ce4fa83185311f80614 Mon Sep 17 00:00:00 2001 From: Wyatt Lansford Date: Mon, 28 Sep 2020 13:43:31 -0400 Subject: [PATCH 2/4] fixing spacing --- adept/rewards/base/base_potential_based_rewards.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/adept/rewards/base/base_potential_based_rewards.py b/adept/rewards/base/base_potential_based_rewards.py index 7e18adf..c0906e4 100644 --- a/adept/rewards/base/base_potential_based_rewards.py +++ b/adept/rewards/base/base_potential_based_rewards.py @@ -14,8 +14,6 @@ class BasePotentialBasedReward: "Policy invariance under reward transformations: Theory and application to reward shaping" https://people.eecs.berkeley.edu/~pabbeel/cs287-fa09/readings/NgHaradaRussell-shaping-ICML1999.pdf - - """ def __init__( @@ -57,9 +55,7 @@ def __init__( self._midpoint = (self._maximum - self._minimum) / 2 + self._minimum - def __call__(self, observation, next_observation, action,) -> float: - return self._potential_shaping_function(observation, next_observation) def name(self) -> str: @@ -72,7 +68,7 @@ def _preprocess_observation(self, x): return min(max(self._minimum, x), self._maximum) def _potential_shaping_function(self, current_observation, next_observation) -> float: - return (self._gamma * self._phi(next_observation)) - self._phi(current_observation) + return (self._gamma * self._phi(next_observation)) - self._phi(current_observation) @abc.abstractmethod def _phi(self, x) -> float: From bef315aa20f33f40c1104ca7737d0c25a7866b71 Mon Sep 17 00:00:00 2001 From: Wyatt Lansford Date: Mon, 28 Sep 2020 13:47:14 -0400 Subject: [PATCH 3/4] removing unused arg --- 
adept/rewards/base/base_potential_based_rewards.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adept/rewards/base/base_potential_based_rewards.py b/adept/rewards/base/base_potential_based_rewards.py index c0906e4..58b13ef 100644 --- a/adept/rewards/base/base_potential_based_rewards.py +++ b/adept/rewards/base/base_potential_based_rewards.py @@ -55,7 +55,7 @@ def __init__( self._midpoint = (self._maximum - self._minimum) / 2 + self._minimum - def __call__(self, observation, next_observation, action,) -> float: + def __call__(self, observation, next_observation) -> float: return self._potential_shaping_function(observation, next_observation) def name(self) -> str: From 2af1e37681eeab1e462ccae8b7dadd492cb3a4e3 Mon Sep 17 00:00:00 2001 From: Wyatt Lansford Date: Mon, 28 Sep 2020 13:52:08 -0400 Subject: [PATCH 4/4] adding clarity to comments --- adept/rewards/base/base_potential_based_rewards.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/adept/rewards/base/base_potential_based_rewards.py b/adept/rewards/base/base_potential_based_rewards.py index 58b13ef..7e26529 100644 --- a/adept/rewards/base/base_potential_based_rewards.py +++ b/adept/rewards/base/base_potential_based_rewards.py @@ -7,10 +7,12 @@ class BasePotentialBasedReward: Class for applying potential reward shaping to a set of observations. Potential based rewards are used to prevent the learning of suboptimal policies. The reward for executing a transition between states is the difference in value between the potential function applied to each state. - This condition is sufficient to guarentee policy invariance + This condition is sufficient to guarantee policy invariance. - For details, see + This implementation provides support for potential based reward shaping over scalar observations + of length 1. 
+ For details, see "Policy invariance under reward transformations: Theory and application to reward shaping" https://people.eecs.berkeley.edu/~pabbeel/cs287-fa09/readings/NgHaradaRussell-shaping-ICML1999.pdf