 DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
 BUFFER_INIT_SAMPLES = 32
 NUM_AGENTS = 12
+EPSILON = 1e-7


 def create_policy_mock(
@@ -136,11 +137,112 @@ def test_policy_evaluate(rnn, visual, discrete):
     assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)


-def test_normalization():
+def test_large_normalization():
     behavior_spec = mb.setup_test_behavior_specs(
         use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
     )
+    # Taken from Walker seed 3713 which causes NaN without proper initialization
+    large_obs1 = [
+        1800.00036621,
+        1799.96972656,
+        1800.01245117,
+        1800.07214355,
+        1800.02758789,
+        1799.98303223,
+        1799.88647461,
+        1799.89575195,
+        1800.03479004,
+        1800.14025879,
+        1800.17675781,
+        1800.20581055,
+        1800.33740234,
+        1800.36450195,
+        1800.43457031,
+        1800.45544434,
+        1800.44604492,
+        1800.56713867,
+        1800.73901367,
+    ]
+    large_obs2 = [
+        1799.99975586,
+        1799.96679688,
+        1799.92980957,
+        1799.89550781,
+        1799.93774414,
+        1799.95300293,
+        1799.94067383,
+        1799.92993164,
+        1799.84057617,
+        1799.69873047,
+        1799.70605469,
+        1799.82849121,
+        1799.85095215,
+        1799.76977539,
+        1799.78283691,
+        1799.76708984,
+        1799.67163086,
+        1799.59191895,
+        1799.5135498,
+        1799.45556641,
+        1799.3717041,
+    ]
+    policy = TFPolicy(
+        0,
+        behavior_spec,
+        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
+        "testdir",
+        False,
+    )
+    time_horizon = len(large_obs1)
+    trajectory = make_fake_trajectory(
+        length=time_horizon,
+        max_step_complete=True,
+        observation_shapes=[(1,)],
+        action_space=[2],
+    )
+    for i in range(time_horizon):
+        trajectory.steps[i].obs[0] = np.array([large_obs1[i]], dtype=np.float32)
+    trajectory_buffer = trajectory.to_agentbuffer()
+    policy.update_normalization(trajectory_buffer["vector_obs"])

+    # Check that the running mean and variance are correct
+    steps, mean, variance = policy.sess.run(
+        [policy.normalization_steps, policy.running_mean, policy.running_variance]
+    )
+    assert mean[0] == pytest.approx(np.mean(large_obs1, dtype=np.float32), abs=0.01)
+    assert variance[0] / steps == pytest.approx(
+        np.var(large_obs1, dtype=np.float32), abs=0.01
+    )
+
+    time_horizon = len(large_obs2)
+    trajectory = make_fake_trajectory(
+        length=time_horizon,
+        max_step_complete=True,
+        observation_shapes=[(1,)],
+        action_space=[2],
+    )
+    for i in range(time_horizon):
+        trajectory.steps[i].obs[0] = np.array([large_obs2[i]], dtype=np.float32)
+
+    trajectory_buffer = trajectory.to_agentbuffer()
+    policy.update_normalization(trajectory_buffer["vector_obs"])
+
+    steps, mean, variance = policy.sess.run(
+        [policy.normalization_steps, policy.running_mean, policy.running_variance]
+    )
+
+    assert mean[0] == pytest.approx(
+        np.mean(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
+    )
+    assert variance[0] / steps == pytest.approx(
+        np.var(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
+    )
+
+
+def test_normalization():
+    behavior_spec = mb.setup_test_behavior_specs(
+        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
+    )
     time_horizon = 6
     trajectory = make_fake_trajectory(
         length=time_horizon,
@@ -169,10 +271,9 @@ def test_normalization():

     assert steps == 6
     assert mean[0] == 0.5
-    # Note: variance is divided by number of steps, and initialized to 1 to avoid
-    # divide by 0. The right answer is 0.25
-    assert (variance[0] - 1) / steps == 0.25
-
+    # Note: variance is initialized to the variance of the initial trajectory + EPSILON
+    # (to avoid divide by 0) and multiplied by the number of steps. The correct answer is 0.25
+    assert variance[0] / steps == pytest.approx(0.25, abs=0.01)
     # Make another update, this time with all 1's
     time_horizon = 10
     trajectory = make_fake_trajectory(
@@ -191,7 +292,7 @@ def test_normalization():

     assert steps == 16
     assert mean[0] == 0.8125
-    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
+    assert variance[0] / steps == pytest.approx(0.152, abs=0.01)


 def test_min_visual_size():
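
For context on what the new asserts check: `running_variance` accumulates the sum of squared deviations (so dividing it by `normalization_steps` recovers the variance), and the fix seeds it from the first trajectory's statistics plus EPSILON rather than a constant 1. The following is a minimal NumPy sketch of that scheme, assuming a Chan-style parallel-variance merge; the class and all its names are illustrative, not the actual TFPolicy implementation.

import numpy as np

EPSILON = 1e-7  # same constant the diff introduces


class RunningNormalizer:
    """Illustrative running mean/variance tracker (not the TFPolicy code)."""

    def __init__(self, first_batch: np.ndarray):
        # Seed the stats from the first trajectory instead of a constant,
        # so very large observations (e.g. ~1800 from Walker) stay finite.
        self.steps = first_batch.shape[0]
        self.mean = first_batch.mean(axis=0)
        # Sum of squared deviations; EPSILON guards against zero variance.
        self.m2 = (first_batch.var(axis=0) + EPSILON) * self.steps

    def update(self, batch: np.ndarray) -> None:
        # Chan et al. parallel-variance merge of the batch statistics.
        m = batch.shape[0]
        delta = batch.mean(axis=0) - self.mean
        total = self.steps + m
        self.mean = self.mean + delta * m / total
        self.m2 = self.m2 + batch.var(axis=0) * m + delta ** 2 * self.steps * m / total
        self.steps = total

    def normalize(self, obs: np.ndarray) -> np.ndarray:
        return (obs - self.mean) / np.sqrt(self.m2 / self.steps + EPSILON)


# Mirrors test_normalization (inferred obs: three 0's and three 1's, then ten 1's).
norm = RunningNormalizer(np.array([[0.0]] * 3 + [[1.0]] * 3))
norm.update(np.ones((10, 1)))
assert norm.steps == 16
assert abs(norm.mean[0] - 0.8125) < 1e-6
assert abs(norm.m2[0] / norm.steps - 0.152) < 0.01

Seeding `m2` from the first batch is exactly why the Walker observations near 1800 no longer produce NaNs: a constant initialization of 1 leaves the early variance estimate wildly wrong relative to the data's scale.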