@@ -18,23 +18,26 @@ def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamm
         self.n_states = n_states
         self.trainer = trainer
 
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        pass
-
     def set_q_table(self, table):
22+ """
23+ Sets the
24+ :param table:
25+ :return:
26+ """
         self.q_table = table
 
-    def idx_demo(self, one_feature):
+    def get_demonstrations(self, one_feature):
+        """
+        Loads the expert demonstrations and discretizes each (position, velocity)
+        observation into integer state indices.
+        :param one_feature: number of discretization bins per observation dimension
+        :return: array of discretized demonstrations
+        """
         env_low = self.env.observation_space.low
         env_high = self.env.observation_space.high
         env_distance = (env_high - env_low) / self.one_feature
 
         raw_demo = np.load(file="expert_demo/expert_demo.npy")
         demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3))
-
         for x in range(len(raw_demo)):
             for y in range(len(raw_demo[0])):
                 position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
@@ -47,7 +50,12 @@ def idx_demo(self, one_feature):
         return demonstrations
 
     def idx_to_state(self, state):
-        """ Convert pos and vel about mounting car environment to the integer value"""
+        """
+        Converts a continuous mountain car state (position, velocity) into a
+        single integer state index.
+        :param state: continuous observation (position, velocity)
+        :return: integer state index
+        """
         env_low = self.env.observation_space.low
         env_high = self.env.observation_space.high
         env_distance = (env_high - env_low) / self.one_feature
@@ -57,38 +65,73 @@ def idx_to_state(self, state):
         return state_idx
 
     def update_q_table(self, state, action, reward, next_state):
+        """
+        Updates the Q-table entry for the given state-action pair.
+        :param state: current (discretized) state index
+        :param action: action taken in the current state
+        :param reward: reward received for the transition
+        :param next_state: resulting (discretized) state index
+        :return: None
+        """
         q_1 = self.q_table[state][action]
         q_2 = reward + self.gamma * max(self.q_table[next_state])
         self.q_table[state][action] += self.q_learning_rate * (q_2 - q_1)
 
     def env_render(self):
+        """
+        Computes the render frames as specified by the environment's render_mode attribute.
+        :return: None
+        """
         self.env.render()
 
     def env_reset(self):
+        """
+        Resets the environment to an initial state and returns the initial observation.
+        The start position is sampled uniformly from [-0.6, -0.4].
+        :return: the value returned by env.reset()
+        """
         return self.env.reset()
 
     def env_step(self, action):
+        """
+        Runs one timestep of the environment's dynamics.
+        :param action: the action to execute
+        :return: the tuple returned by env.step(action)
+        """
         return self.env.step(action)
 
     def train(self, theta_learning_rate):
-        demonstrations = self.idx_demo(self.one_feature)
-
+        """
+        Trains the agent with tabular Q-learning and maximum entropy IRL.
+        :param theta_learning_rate: learning rate for the IRL reward parameters (theta)
+        :return: None
+        """
+        demonstrations = self.get_demonstrations(self.one_feature)
+
+        # Get expert feature expectations
         expert = self.trainer.expert_feature_expectations(demonstrations)
+
+        # Learning
         learner_feature_expectations = np.zeros(self.n_states)
         episodes, scores = [], []
-
+        # For every episode
         for episode in range(30000):
+            # Reset the environment and get the initial observation.
+            # The start position is sampled uniformly from [-0.6, -0.4].
             state = self.env_reset()
             score = 0
 
+            # Update the reward parameters via MaxEnt IRL at episode 10000 and
+            # every 5000 episodes thereafter, using the learner feature
+            # expectations averaged over the episodes so far.
             if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
                 learner = learner_feature_expectations / episode
                 self.trainer.maxent_irl(expert, learner, theta_learning_rate)
 
+            # Step through the environment until the episode ends
             state = state[0]
             while True:
                 state_idx = self.idx_to_state(state)
                 action = np.argmax(self.q_table[state_idx])
+                # Run one timestep of the environment's dynamics.
                 next_state, reward, done, _, _ = self.env_step(action)
 
                 irl_reward = self.trainer.get_reward(self.n_states, state_idx)
@@ -104,14 +147,18 @@ def train(self, theta_learning_rate):
                     episodes.append(episode)
                     break
 
-            if episode % 100 == 0:
+            if episode % 1000 == 0:
                 score_avg = np.mean(scores)
                 print('{} episode score is {:.2f}'.format(episode, score_avg))
                 plt.plot(episodes, scores, 'b')
                 plt.savefig("./learning_curves/maxent_30000.png")
                 np.save("./results/maxent_30000_table", arr=self.q_table)
 
     def test(self):
+        """
+        Evaluates the previously trained agent.
+        :return: None
+        """
         episodes, scores = [], []
 
         for episode in range(10):