 import argparse
 import gym
 import matplotlib.pyplot as plt
-import numpy as np
 import logging
 import numpy as np
 import sys

-from MountainCar import MountainCar
-from MaxEntropyIRL import MaxEntropyIRL
+from .MountainCar import MountainCar
+from .MaxEntropyIRL import MaxEntropyIRL

 # from irlwpytorch import __version__


 _logger = logging.getLogger(__name__)

-n_states = 400  # position - 20, velocity - 20
-n_actions = 3
-one_feature = 20  # number of state per one feature
-q_table = np.zeros((n_states, n_actions))  # (400, 3)
-feature_matrix = np.eye((n_states))  # (400, 400)
-
-gamma = 0.99
-q_learning_rate = 0.03
-theta_learning_rate = 0.05
-
 np.random.seed(1)


-def idx_demo(env, one_feature):
-    env_low = env.observation_space.low
-    env_high = env.observation_space.high
-    env_distance = (env_high - env_low) / one_feature
-
-    raw_demo = np.load(file="expert_demo/expert_demo.npy")
-    demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3))
-
-    for x in range(len(raw_demo)):
-        for y in range(len(raw_demo[0])):
-            position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
-            velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1])
-            state_idx = position_idx + velocity_idx * one_feature
-
-            demonstrations[x][y][0] = state_idx
-            demonstrations[x][y][1] = raw_demo[x][y][2]
-
-    return demonstrations
-
-
-def idx_state(env, state):
-    env_low = env.observation_space.low
-    env_high = env.observation_space.high
-    env_distance = (env_high - env_low) / one_feature
-    position_idx = int((state[0] - env_low[0]) / env_distance[0])
-    velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
-    state_idx = position_idx + velocity_idx * one_feature
-    return state_idx
-
-
-def update_q_table(state, action, reward, next_state):
-    q_1 = q_table[state][action]
-    q_2 = reward + gamma * max(q_table[next_state])
-    q_table[state][action] += q_learning_rate * (q_2 - q_1)
-
-
-q_table = np.load(file="results/maxent_q_table.npy")  # (400, 3)
-one_feature = 20  # number of state per one feature
-
-
-def idx_to_state(env, state):
-    """Convert pos and vel about mounting car environment to the integer value"""
-    env_low = env.observation_space.low
-    env_high = env.observation_space.high
-    env_distance = (env_high - env_low) / one_feature
-    position_idx = int((state[0] - env_low[0]) / env_distance[0])
-    velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
-    state_idx = position_idx + velocity_idx * one_feature
-    return state_idx
-
-
 def parse_args(args):
     """Parse command line parameters

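For reference, the helpers removed in the hunk above discretize the continuous MountainCar observation (position, velocity) into one of 400 states via state_idx = position_idx + velocity_idx * one_feature; that logic now lives in the MountainCar class. A minimal standalone sketch of the mapping, assuming the usual MountainCar-v0 observation bounds (the name to_state_idx is illustrative, not part of the commit):

    import numpy as np

    def to_state_idx(obs, one_feature=20):
        env_low = np.array([-1.2, -0.07])   # assumed MountainCar-v0 lower bounds
        env_high = np.array([0.6, 0.07])    # assumed MountainCar-v0 upper bounds
        env_distance = (env_high - env_low) / one_feature  # 20 x 20 grid -> 400 states
        position_idx = int((obs[0] - env_low[0]) / env_distance[0])
        velocity_idx = int((obs[1] - env_low[1]) / env_distance[1])
        return position_idx + velocity_idx * one_feature

    print(to_state_idx([-0.74, 0.01]))  # position bin 5, velocity bin 11 -> 5 + 11 * 20 = 225
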
@@ -119,6 +57,10 @@ def parse_args(args):
         action="version",
         # version=f"IRLwPytorch {__version__}",
     )
+    parser.add_argument('--training', action='store_true', help="Enables training of model.")
+    parser.add_argument('--testing', action='store_true',
+                        help="Enables testing of previously created model.")
+    parser.add_argument('--render', action='store_true', help="Enables visualization of mountaincar.")
     return parser.parse_args(args)


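The three flags added above drive the branches in main(); a quick, hypothetical way to exercise them directly against the parse_args defined in this file (not part of the commit):

    args = parse_args(["--training", "--render"])
    assert args.training and args.render and not args.testing
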
@@ -147,36 +89,51 @@ def main(args):
     args = parse_args(args)
     _logger.debug("Starting crazy calculations...")

-    car = MountainCar()
+    n_states = 400  # position - 20, velocity - 20
+    n_actions = 3
+    one_feature = 20  # number of state per one feature
+    feature_matrix = np.eye((n_states))  # (400, 400)
+
+    gamma = 0.99
+    q_learning_rate = 0.03
+    theta_learning_rate = 0.05
+
+    car = None
+    if args.render:
+        car = MountainCar(True, feature_matrix, one_feature, q_learning_rate, gamma)
+    else:
+        car = MountainCar(False, feature_matrix, one_feature, q_learning_rate, gamma)

     theta = -(np.random.uniform(size=(n_states,)))
     trainer = MaxEntropyIRL(feature_matrix, theta)

-    if False:
-        env = gym.make('MountainCar-v0', render_mode="human")
-        demonstrations = idx_demo(env, one_feature)
+    if args.training:
+        q_table = np.zeros((n_states, n_actions))  # (400, 3)
+        car.set_q_table(q_table)
+
+        demonstrations = car.idx_demo(one_feature)

         expert = trainer.expert_feature_expectations(demonstrations)
         learner_feature_expectations = np.zeros(n_states)
         episodes, scores = [], []

-        for episode in range(300):
-            state = env.reset()
+        for episode in range(30000):
+            state = car.env_reset()
             score = 0

-            if (episode != 0 and episode == 100) or (episode > 100 and episode % 50 == 0):
+            if (episode != 0 and episode == 10000) or (episode > 10000 and episode % 5000 == 0):
                 learner = learner_feature_expectations / episode
                 trainer.maxent_irl(expert, learner, theta_learning_rate)

             state = state[0]
             while True:
-                state_idx = idx_state(env, state)
+                state_idx = car.idx_state(state)
                 action = np.argmax(q_table[state_idx])
-                next_state, reward, done, _, _ = env.step(action)
+                next_state, reward, done, _, _ = car.env_step(action)

                 irl_reward = trainer.get_reward(n_states, state_idx)
-                next_state_idx = idx_state(env, next_state)
-                update_q_table(state_idx, action, irl_reward, next_state_idx)
+                next_state_idx = car.idx_state(next_state)
+                car.update_q_table(state_idx, action, irl_reward, next_state_idx)

                 learner_feature_expectations += trainer.get_feature_matrix()[int(state_idx)]

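The training loop above follows the usual linear maximum-entropy IRL recipe: learner state-visitation feature expectations are accumulated every step, periodically compared against the expert's, and the Q-table is trained with the learned reward instead of the environment reward. A hedged sketch of what the two update calls presumably do (maxent_irl is assumed to take a plain gradient step; update_q_table is assumed to mirror the module-level function removed in the first hunk):

    # Reward-weight update: gradient ascent on theta with the MaxEnt gradient
    # mu_expert - mu_learner (both vectors of length n_states).
    gradient = expert - learner
    theta += theta_learning_rate * gradient

    # Q-learning update driven by the IRL reward rather than the environment reward.
    q_1 = q_table[state_idx][action]
    q_2 = irl_reward + gamma * max(q_table[next_state_idx])
    q_table[state_idx][action] += q_learning_rate * (q_2 - q_1)
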
@@ -187,28 +144,29 @@ def main(args):
                 episodes.append(episode)
                 break

-            if episode % 10 == 0:
+            if episode % 100 == 0:
                 score_avg = np.mean(scores)
                 print('{} episode score is {:.2f}'.format(episode, score_avg))
                 plt.plot(episodes, scores, 'b')
-                plt.savefig("./learning_curves/maxent_300.png")
-                np.save("./results/maxent_300_table", arr=q_table)
+                plt.savefig("./learning_curves/maxent_30000.png")
+                np.save("./results/maxent_30000_table", arr=q_table)

-    else:
-        env = gym.make('MountainCar-v0', render_mode="human")
+    if args.testing:
+        q_table = np.load(file="results/maxent_q_table.npy")  # (400, 3)
+        car.set_q_table(q_table)

         episodes, scores = [], []

         for episode in range(10):
-            state = env.reset()
+            state = car.env_reset()
             score = 0

             state = state[0]
             while True:
-                env.render()
-                state_idx = idx_to_state(env, state)
+                car.env_render()
+                state_idx = car.idx_to_state(state)
                 action = np.argmax(q_table[state_idx])
-                next_state, reward, done, _, _ = env.step(action)
+                next_state, reward, done, _, _ = car.env_step(action)

                 score += reward
                 state = next_state
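The testing branch simply rolls out the greedy policy from the loaded table, with no further learning; illustratively (assuming results/maxent_q_table.npy exists, as the branch above requires):

    q_table = np.load("results/maxent_q_table.npy")  # shape (400, 3)
    state_idx = 225                                   # a discretized state, e.g. from the earlier sketch
    action = int(np.argmax(q_table[state_idx]))       # greedy action: 0 = push left, 1 = no push, 2 = push right
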
@@ -217,7 +175,7 @@ def main(args):
                 scores.append(score)
                 episodes.append(episode)
                 plt.plot(episodes, scores, 'b')
-                plt.savefig("./learning_curves/maxent_test_300.png")
+                plt.savefig("./learning_curves/maxent_test_30000.png")
                 break

             if episode % 1 == 0:
@@ -235,14 +193,4 @@ def run():


 if __name__ == "__main__":
-    # ^  This is a guard statement that will prevent the following code from
-    #    being executed in the case someone imports this file instead of
-    #    executing it as a script.
-    #    https://docs.python.org/3/library/__main__.html
-
-    # After installing your project with pip, users can also run your Python
-    # modules as scripts via the ``-m`` flag, as defined in PEP 338::
-    #
-    #     python -m irlwpytorch.skeleton 42
-    #
     run()