@@ -18,23 +18,26 @@ def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamm
         self.n_states = n_states
         self.trainer = trainer
 
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        pass
-
     def set_q_table(self, table):
22+ """
23+ Sets the
24+ :param table:
25+ :return:
26+ """
         self.q_table = table
 
-    def idx_demo(self, one_feature):
+    def get_demonstrations(self, one_feature):
+        """
+        Loads the expert demonstrations and discretizes each (position, velocity)
+        observation into integer state indices.
+        :param one_feature: number of discretization bins per observation dimension
+        :return: array of discretized demonstrations
+        """
         env_low = self.env.observation_space.low
         env_high = self.env.observation_space.high
         env_distance = (env_high - env_low) / self.one_feature
 
         raw_demo = np.load(file="expert_demo/expert_demo.npy")
         demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3))
-
         for x in range(len(raw_demo)):
             for y in range(len(raw_demo[0])):
                 position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
@@ -47,7 +50,12 @@ def idx_demo(self, one_feature):
         return demonstrations
 
     def idx_to_state(self, state):
-        """ Convert pos and vel about mounting car environment to the integer value"""
+        """
+        Converts a continuous mountain car state (position, velocity) into a
+        single integer state index.
+        :param state: continuous observation (position, velocity)
+        :return: integer state index
+        """
         env_low = self.env.observation_space.low
         env_high = self.env.observation_space.high
         env_distance = (env_high - env_low) / self.one_feature
@@ -57,38 +65,73 @@ def idx_to_state(self, state):
         return state_idx
 
     def update_q_table(self, state, action, reward, next_state):
+        """
+        Updates the Q-table entry for the given state-action pair.
+        :param state: current (discretized) state index
+        :param action: action taken in the current state
+        :param reward: reward received for the transition
+        :param next_state: resulting (discretized) state index
+        :return: None
+        """
         q_1 = self.q_table[state][action]
         q_2 = reward + self.gamma * max(self.q_table[next_state])
         self.q_table[state][action] += self.q_learning_rate * (q_2 - q_1)
 
     def env_render(self):
+        """
+        Computes the render frames as specified by the environment's render_mode attribute.
+        :return: None
+        """
         self.env.render()
 
     def env_reset(self):
+        """
+        Resets the environment to an initial state and returns the initial observation.
+        The start position is sampled uniformly from [-0.6, -0.4].
+        :return: the value returned by env.reset()
+        """
         return self.env.reset()
 
     def env_step(self, action):
+        """
+        Runs one timestep of the environment's dynamics.
+        :param action: the action to execute
+        :return: the tuple returned by env.step(action)
+        """
         return self.env.step(action)
 
     def train(self, theta_learning_rate):
-        demonstrations = self.idx_demo(self.one_feature)
-
+        """
+        Trains the agent with tabular Q-learning and maximum entropy IRL.
+        :param theta_learning_rate: learning rate for the IRL reward parameters (theta)
+        :return: None
+        """
+        demonstrations = self.get_demonstrations(self.one_feature)
+
+        # Get expert feature expectations
         expert = self.trainer.expert_feature_expectations(demonstrations)
+
+        # Learning
         learner_feature_expectations = np.zeros(self.n_states)
         episodes, scores = [], []
-
+        # For every episode
         for episode in range(30000):
+            # Reset the environment and get the initial observation.
+            # The start position is sampled uniformly from [-0.6, -0.4].
             state = self.env_reset()
             score = 0
 
+            # Update the reward parameters via MaxEnt IRL at episode 10000 and
+            # every 5000 episodes thereafter, using the learner feature
+            # expectations averaged over the episodes so far.
             if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
                 learner = learner_feature_expectations / episode
                 self.trainer.maxent_irl(expert, learner, theta_learning_rate)
 
+            # Step through the environment until the episode ends
             state = state[0]
             while True:
                 state_idx = self.idx_to_state(state)
                 action = np.argmax(self.q_table[state_idx])
+                # Run one timestep of the environment's dynamics.
                 next_state, reward, done, _, _ = self.env_step(action)
 
                 irl_reward = self.trainer.get_reward(self.n_states, state_idx)
@@ -104,14 +147,18 @@ def train(self, theta_learning_rate):
                     episodes.append(episode)
                     break
 
-            if episode % 100 == 0:
+            if episode % 1000 == 0:
                 score_avg = np.mean(scores)
                 print('{} episode score is {:.2f}'.format(episode, score_avg))
                 plt.plot(episodes, scores, 'b')
                 plt.savefig("./learning_curves/maxent_30000.png")
                 np.save("./results/maxent_30000_table", arr=self.q_table)
 
     def test(self):
+        """
+        Evaluates the previously trained agent.
+        :return: None
+        """
         episodes, scores = [], []
 
         for episode in range(10):