+ #
+ # This file is heavily inspired by the IRL implementation at:
+ # https://github.com/reinforcement-learning-kr/lets-do-irl/tree/master/mountaincar/maxent
+ # It is a class-based implementation restructured for our use case.
+ #
+
import gym
import numpy as np
- import matplotlib.pyplot as plt


class MountainCar:

-     def __init__(self, animation, feature_matrix, one_feature, q_learning_rate, gamma, n_states, trainer):
+     def __init__(self, animation, one_feature):
        if animation:
            self.env = gym.make('MountainCar-v0', render_mode="human")
        else:
            self.env = gym.make('MountainCar-v0')
-         self.feature_matrix = feature_matrix
        self.one_feature = one_feature
-         self.q_table = None
-         self.q_learning_rate = q_learning_rate
-         self.gamma = gamma
-         self.n_states = n_states
-         self.trainer = trainer
-
-     def set_q_table(self, table):
-         """
-         Sets the Q-table.
-         :param table:
-         :return:
-         """
-         self.q_table = table

-     def get_demonstrations(self, one_feature):
+     def get_demonstrations(self):
        """
        Parses the expert demonstrations and returns them as discretized (state index, action) pairs.
        :param one_feature:
@@ -42,7 +33,7 @@ def get_demonstrations(self, one_feature):
            for y in range(len(raw_demo[0])):
                position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
                velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1])
-                 state_idx = position_idx + velocity_idx * one_feature
+                 state_idx = position_idx + velocity_idx * self.one_feature

                demonstrations[x][y][0] = state_idx
                demonstrations[x][y][1] = raw_demo[x][y][2]
@@ -64,18 +55,11 @@ def idx_to_state(self, state):
        state_idx = position_idx + velocity_idx * self.one_feature
        return state_idx

-     def update_q_table(self, state, action, reward, next_state):
-         """
-         Updates the Q-table for a specified state and action.
-         :param state:
-         :param action:
-         :param reward:
-         :param next_state:
-         :return:
-         """
-         q_1 = self.q_table[state][action]
-         q_2 = reward + self.gamma * max(self.q_table[next_state])
-         self.q_table[state][action] += self.q_learning_rate * (q_2 - q_1)
+     def env_action_space(self):
+         return self.env.action_space
+
+     def env_observation_space(self):
+         return self.env.observation_space

    def env_render(self):
        """
@@ -99,88 +83,3 @@ def env_step(self, action):
        :return:
        """
        return self.env.step(action)
-
-     def train(self, theta_learning_rate):
-         """
-         Trains a Q-table policy using rewards learned via MaxEnt IRL.
-         :param theta_learning_rate:
-         :return:
-         """
-         demonstrations = self.get_demonstrations(self.one_feature)
-
-         # Get expert feature expectations
-         expert = self.trainer.expert_feature_expectations(demonstrations)
-
-         # Learning
-         learner_feature_expectations = np.zeros(self.n_states)
-         episodes, scores = [], []
-         # For every episode
-         for episode in range(30000):
-             # Resets the environment to an initial state and returns the initial observation.
-             # The start position is drawn uniformly from the range [-0.6, -0.4].
-             state = self.env_reset()
-             score = 0
-
-             # Periodically run the MaxEnt IRL update on the averaged learner feature expectations
-             if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
-                 learner = learner_feature_expectations / episode
-                 self.trainer.maxent_irl(expert, learner, theta_learning_rate)
-
-             # One step in the environment
-             state = state[0]
-             while True:
-                 state_idx = self.idx_to_state(state)
-                 action = np.argmax(self.q_table[state_idx])
-                 # Run one timestep of the environment's dynamics.
-                 next_state, reward, done, _, _ = self.env_step(action)
-
-                 irl_reward = self.trainer.get_reward(self.n_states, state_idx)
-                 next_state_idx = self.idx_to_state(next_state)
-                 self.update_q_table(state_idx, action, irl_reward, next_state_idx)
-
-                 learner_feature_expectations += self.trainer.get_feature_matrix()[int(state_idx)]
-
-                 score += reward
-                 state = next_state
-                 if done:
-                     scores.append(score)
-                     episodes.append(episode)
-                     break
-
-             if episode % 1000 == 0:
-                 score_avg = np.mean(scores)
-                 print('{} episode score is {:.2f}'.format(episode, score_avg))
-                 plt.plot(episodes, scores, 'b')
-                 plt.savefig("./learning_curves/maxent_30000.png")
-                 np.save("./results/maxent_30000_table", arr=self.q_table)
-
-     def test(self):
-         """
-         Tests the previously trained model.
-         :return:
-         """
-         episodes, scores = [], []
-
-         for episode in range(10):
-             state = self.env_reset()
-             score = 0
-
-             state = state[0]
-             while True:
-                 self.env_render()
-                 state_idx = self.idx_to_state(state)
-                 action = np.argmax(self.q_table[state_idx])
-                 next_state, reward, done, _, _ = self.env_step(action)
-
-                 score += reward
-                 state = next_state
-
-                 if done:
-                     scores.append(score)
-                     episodes.append(episode)
-                     plt.plot(episodes, scores, 'b')
-                     plt.savefig("./learning_curves/maxent_test_30000.png")
-                     break
-
-             if episode % 1 == 0:
-                 print('{} episode score is {:.2f}'.format(episode, score))
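
The standalone sketch below illustrates the state discretization that idx_to_state() and get_demonstrations() rely on, and exercises the same gym calls the remaining wrapper methods (action space, observation space, reset, step) forward to. It is only a sketch: the grid resolution one_feature = 20 and the bin width env_distance = (env_high - env_low) / one_feature are assumptions carried over from the upstream lets-do-irl code referenced in the file header, not values defined in this diff.

import gym

one_feature = 20  # assumed number of bins per observation dimension (not set in this diff)

env = gym.make('MountainCar-v0')
env_low = env.observation_space.low                  # [min position, min velocity]
env_high = env.observation_space.high                # [max position, max velocity]
env_distance = (env_high - env_low) / one_feature    # assumed bin width per dimension


def to_state_idx(observation):
    # Same flattening as idx_to_state(): the position index varies fastest,
    # giving one_feature * one_feature discrete states in total.
    position_idx = int((observation[0] - env_low[0]) / env_distance[0])
    velocity_idx = int((observation[1] - env_low[1]) / env_distance[1])
    return position_idx + velocity_idx * one_feature


# Take a few random steps and print the resulting discrete state indices.
observation, _ = env.reset()                         # gym >= 0.26 returns (obs, info)
for _ in range(5):
    action = env.action_space.sample()
    observation, reward, terminated, truncated, _ = env.step(action)
    print(to_state_idx(observation), action, reward)
env.close()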