+ #
+ # This file is heavily inspired by the IRL implementation at:
+ # https://github.com/reinforcement-learning-kr/lets-do-irl/tree/master/mountaincar/maxent
+ # It is a class-based implementation restructured for our use case.
+ #
+
import gym
import numpy as np
- import matplotlib.pyplot as plt


class MountainCar:

-     def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamma, n_states, trainer):
+     def __init__(self, animation, one_feature):
        if animation:
            self.env = gym.make('MountainCar-v0', render_mode="human")
        else:
            self.env = gym.make('MountainCar-v0')
-         self.feature_matrix = feature_matrix
        self.one_feature = one_feature
-         self.q_table = None
-         self.q_learning_rate = q_learning_rate
-         self.gamma = gamma
-         self.n_states = n_states
-         self.trainer = trainer
-
-     def set_q_table(self, table):
-         """
-         Sets the Q-table.
-         :param table:
-         :return:
-         """
-         self.q_table = table

-     def get_demonstrations(self, one_feature):
+     def get_demonstrations(self):
        """
        Parses the expert demonstrations and returns them as discretized (state index, action) pairs.
        :param one_feature:
@@ -42,7 +33,7 @@ def get_demonstrations(self, one_feature):
            for y in range(len(raw_demo[0])):
                position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
                velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1])
-                 state_idx = position_idx + velocity_idx * one_feature
+                 state_idx = position_idx + velocity_idx * self.one_feature

                demonstrations[x][y][0] = state_idx
                demonstrations[x][y][1] = raw_demo[x][y][2]
@@ -64,18 +55,11 @@ def idx_to_state(self, state):
        state_idx = position_idx + velocity_idx * self.one_feature
        return state_idx

-     def update_q_table(self, state, action, reward, next_state):
-         """
-         Updates the Q-table for a specified state and action.
-         :param state:
-         :param action:
-         :param reward:
-         :param next_state:
-         :return:
-         """
-         q_1 = self.q_table[state][action]
-         q_2 = reward + self.gamma * max(self.q_table[next_state])
-         self.q_table[state][action] += self.q_learning_rate * (q_2 - q_1)
+     def env_action_space(self):
+         return self.env.action_space
+
+     def env_observation_space(self):
+         return self.env.observation_space

    def env_render(self):
        """
@@ -99,88 +83,3 @@ def env_step(self, action):
        :return:
        """
        return self.env.step(action)
-
-     def train(self, theta_learning_rate):
-         """
-         Trains a Q-table policy using rewards learned via MaxEnt IRL.
-         :param theta_learning_rate:
-         :return:
-         """
-         demonstrations = self.get_demonstrations(self.one_feature)
-
-         # Get expert feature expectations
-         expert = self.trainer.expert_feature_expectations(demonstrations)
-
-         # Learning
-         learner_feature_expectations = np.zeros(self.n_states)
-         episodes, scores = [], []
-         # For every episode
-         for episode in range(30000):
-             # Resets the environment to an initial state and returns the initial observation.
-             # The start position is drawn uniformly from the range [-0.6, -0.4].
-             state = self.env_reset()
-             score = 0
-
-             # Periodically run the MaxEnt IRL update on the averaged learner feature expectations
-             if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
-                 learner = learner_feature_expectations / episode
-                 self.trainer.maxent_irl(expert, learner, theta_learning_rate)
-
-             # One step in the environment
-             state = state[0]
-             while True:
-                 state_idx = self.idx_to_state(state)
-                 action = np.argmax(self.q_table[state_idx])
-                 # Run one timestep of the environment's dynamics.
-                 next_state, reward, done, _, _ = self.env_step(action)
-
-                 irl_reward = self.trainer.get_reward(self.n_states, state_idx)
-                 next_state_idx = self.idx_to_state(next_state)
-                 self.update_q_table(state_idx, action, irl_reward, next_state_idx)
-
-                 learner_feature_expectations += self.trainer.get_feature_matrix()[int(state_idx)]
-
-                 score += reward
-                 state = next_state
-                 if done:
-                     scores.append(score)
-                     episodes.append(episode)
-                     break
-
-             if episode % 1000 == 0:
-                 score_avg = np.mean(scores)
-                 print('{} episode score is {:.2f}'.format(episode, score_avg))
-                 plt.plot(episodes, scores, 'b')
-                 plt.savefig("./learning_curves/maxent_30000.png")
-                 np.save("./results/maxent_30000_table", arr=self.q_table)
-
-     def test(self):
-         """
-         Tests the previously trained model.
-         :return:
-         """
-         episodes, scores = [], []
-
-         for episode in range(10):
-             state = self.env_reset()
-             score = 0
-
-             state = state[0]
-             while True:
-                 self.env_render()
-                 state_idx = self.idx_to_state(state)
-                 action = np.argmax(self.q_table[state_idx])
-                 next_state, reward, done, _, _ = self.env_step(action)
-
-                 score += reward
-                 state = next_state
-
-                 if done:
-                     scores.append(score)
-                     episodes.append(episode)
-                     plt.plot(episodes, scores, 'b')
-                     plt.savefig("./learning_curves/maxent_test_30000.png")
-                     break
-
-             if episode % 1 == 0:
-                 print('{} episode score is {:.2f}'.format(episode, score))
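
The standalone sketch below illustrates the state discretization that idx_to_state() and get_demonstrations() rely on, and exercises the same gym calls the remaining wrapper methods (action space, observation space, reset, step) forward to. It is only a sketch: the grid resolution one_feature = 20 and the bin width env_distance = (env_high - env_low) / one_feature are assumptions carried over from the upstream lets-do-irl code referenced in the file header, not values defined in this diff.

import gym

one_feature = 20  # assumed number of bins per observation dimension (not set in this diff)

env = gym.make('MountainCar-v0')
env_low = env.observation_space.low                  # [min position, min velocity]
env_high = env.observation_space.high                # [max position, max velocity]
env_distance = (env_high - env_low) / one_feature    # assumed bin width per dimension


def to_state_idx(observation):
    # Same flattening as idx_to_state(): the position index varies fastest,
    # giving one_feature * one_feature discrete states in total.
    position_idx = int((observation[0] - env_low[0]) / env_distance[0])
    velocity_idx = int((observation[1] - env_low[1]) / env_distance[1])
    return position_idx + velocity_idx * one_feature


# Take a few random steps and print the resulting discrete state indices.
observation, _ = env.reset()                         # gym >= 0.26 returns (obs, info)
for _ in range(5):
    action = env.action_space.sample()
    observation, reward, terminated, truncated, _ = env.step(action)
    print(to_state_idx(observation), action, reward)
env.close()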