@@ -419,23 +419,35 @@ def compute_returns_and_advantage(self, last_values: th.Tensor, dones: np.ndarray) -> None:
         :param last_values: state value estimation for the last step (one for each env)
         :param dones: if the last step was a terminal step (one bool for each env).
         """
-        # Convert to numpy
+        # # Convert to numpy
+        # last_values = last_values.clone().cpu().numpy().flatten()  # type: ignore[assignment]
+
+        # last_gae_lam = 0
+        # for step in reversed(range(self.buffer_size)):
+        #     if step == self.buffer_size - 1:
+        #         next_non_terminal = 1.0 - dones.astype(np.float32)
+        #         next_values = last_values
+        #     else:
+        #         next_non_terminal = 1.0 - self.episode_starts[step + 1]
+        #         next_values = self.values[step + 1]
+        #     delta = self.rewards[step] + self.gamma * next_values * next_non_terminal - self.values[step]
+        #     last_gae_lam = delta + self.gamma * self.gae_lambda * next_non_terminal * last_gae_lam
+        #     self.advantages[step] = last_gae_lam
+        # # TD(lambda) estimator, see Github PR #375 or "Telescoping in TD(lambda)"
+        # # in David Silver Lecture 4: https://www.youtube.com/watch?v=PnHCvfgC_ZA
+        # self.returns = self.advantages + self.values
+
         last_values = last_values.clone().cpu().numpy().flatten()  # type: ignore[assignment]
+        values = np.concatenate((self.values, last_values.reshape(1, -1)))
+        dones = np.concatenate((self.episode_starts, dones.reshape(1, -1)))
+        next_non_terminal = (1.0 - dones.astype(np.float32))[1:]

-        last_gae_lam = 0
+        returns = [self.values[-1]]
+        interm = self.rewards + self.gamma * (1 - self.gae_lambda) * next_non_terminal * values[1:]
         for step in reversed(range(self.buffer_size)):
-            if step == self.buffer_size - 1:
-                next_non_terminal = 1.0 - dones.astype(np.float32)
-                next_values = last_values
-            else:
-                next_non_terminal = 1.0 - self.episode_starts[step + 1]
-                next_values = self.values[step + 1]
-            delta = self.rewards[step] + self.gamma * next_values * next_non_terminal - self.values[step]
-            last_gae_lam = delta + self.gamma * self.gae_lambda * next_non_terminal * last_gae_lam
-            self.advantages[step] = last_gae_lam
-        # TD(lambda) estimator, see Github PR #375 or "Telescoping in TD(lambda)"
-        # in David Silver Lecture 4: https://www.youtube.com/watch?v=PnHCvfgC_ZA
-        self.returns = self.advantages + self.values
+            returns.append(interm[step] + self.gamma * self.gae_lambda * next_non_terminal[step] * returns[-1])
+        self.returns = np.stack(list(reversed(returns))[:-1], 0)
+        self.advantages = self.returns - self.values

     def add(
         self,
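
The rewrite above drops the advantage-first GAE loop and computes the TD(lambda) return directly via the backward recursion G_t = r_t + gamma * nnt_t * ((1 - lambda) * V(s_{t+1}) + lambda * G_{t+1}), then recovers advantages as returns minus values. The following standalone NumPy sketch (not part of the diff; all names, shapes, and the random data are illustrative) checks that recursion against the original loop; note it seeds the recursion with the bootstrap value V(s_T), i.e. values[-1], whereas the diff seeds it with self.values[-1], so the two can differ slightly near the final step.

# Minimal sketch, assuming nothing from stable-baselines3: compare the original
# GAE loop (returns = advantages + values) with the direct lambda-return recursion.
import numpy as np

rng = np.random.default_rng(0)
T, n_envs = 8, 2
gamma, lam = 0.99, 0.95

rewards = rng.normal(size=(T, n_envs)).astype(np.float32)
values = rng.normal(size=(T, n_envs)).astype(np.float32)        # V(s_0) .. V(s_{T-1})
last_values = rng.normal(size=(n_envs,)).astype(np.float32)     # bootstrap V(s_T)
episode_starts = (rng.random((T, n_envs)) < 0.1).astype(np.float32)
dones = (rng.random((n_envs,)) < 0.1).astype(np.float32)

# Reference: the commented-out advantage-first loop from the diff.
advantages = np.zeros_like(rewards)
last_gae_lam = 0.0
for step in reversed(range(T)):
    if step == T - 1:
        nnt, next_values = 1.0 - dones, last_values
    else:
        nnt, next_values = 1.0 - episode_starts[step + 1], values[step + 1]
    delta = rewards[step] + gamma * next_values * nnt - values[step]
    last_gae_lam = delta + gamma * lam * nnt * last_gae_lam
    advantages[step] = last_gae_lam
gae_returns = advantages + values

# Direct lambda-return recursion, same shape as the new code in the diff,
# but seeded with the bootstrap value V(s_T) so the comparison is exact.
all_values = np.concatenate((values, last_values.reshape(1, -1)))
nnt_all = (1.0 - np.concatenate((episode_starts, dones.reshape(1, -1))))[1:]
interm = rewards + gamma * (1 - lam) * nnt_all * all_values[1:]
ret = [last_values]  # the diff uses self.values[-1] here instead
for step in reversed(range(T)):
    ret.append(interm[step] + gamma * lam * nnt_all[step] * ret[-1])
lambda_returns = np.stack(list(reversed(ret))[:-1], 0)

assert np.allclose(gae_returns, lambda_returns, atol=1e-5)
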
@@ -521,6 +533,49 @@ def _get_samples(
         return RolloutBufferSamples(*tuple(map(self.to_torch, data)))


+class ExpRolloutBuffer(RolloutBuffer):
+
+    def __init__(self, buffer_size, observation_space, action_space, device="auto", gae_lambda=0.95, gamma=0.99, n_envs=1, beta=0):
+        super().__init__(buffer_size, observation_space, action_space, device, gae_lambda, gamma, n_envs)
+        self.beta = beta
+
+    def compute_returns_and_advantage(self, last_values, dones):
+
+        # # Convert to numpy
+        # last_values = last_values.clone().cpu().numpy().flatten()  # type: ignore[assignment]
+
+        # last_gae_lam = 0
+        # for step in reversed(range(self.buffer_size)):
+        #     if step == self.buffer_size - 1:
+        #         next_non_terminal = 1.0 - dones.astype(np.float32)
+        #         next_values = last_values
+        #     else:
+        #         next_non_terminal = 1.0 - self.episode_starts[step + 1]
+        #         next_values = self.values[step + 1]
+        #     delta = np.exp(self.beta * self.rewards[step] + self.gamma * np.log(1e-15 + np.maximum(next_values, 0)) * next_non_terminal) - self.values[step]
+        #     # delta = self.rewards[step] + self.gamma * next_values * next_non_terminal - self.values[step]
+        #     last_gae_lam = delta + self.gamma * self.gae_lambda * next_non_terminal * last_gae_lam
+        #     self.advantages[step] = last_gae_lam
+        # # TD(lambda) estimator, see Github PR #375 or "Telescoping in TD(lambda)"
+        # # in David Silver Lecture 4: https://www.youtube.com/watch?v=PnHCvfgC_ZA
+        # self.returns = self.advantages + self.values
+
+
+        last_values = last_values.clone().cpu().numpy().flatten()  # type: ignore[assignment]
+        values = np.concatenate((self.values, last_values.reshape(1, -1)))
+        dones = np.concatenate((self.episode_starts, dones.reshape(1, -1)))
+        next_non_terminal = (1.0 - dones.astype(np.float32))[1:]
+
+        returns = [self.values[-1]]
+        interm = self.beta * self.rewards + self.gamma * (1 - self.gae_lambda) * next_non_terminal * np.log(1e-15 + np.maximum(0, values[1:]))
+        for step in reversed(range(self.buffer_size)):
+            returns.append(np.exp(interm[step] + self.gamma * self.gae_lambda * next_non_terminal[step] * np.log(1e-15 + np.maximum(0, returns[-1]))))
+        self.returns = np.stack(list(reversed(returns))[:-1], 0)
+        self.advantages = self.returns - self.values
+
+
+
+
 class DictReplayBuffer(ReplayBuffer):
     """
     Dict Replay buffer used in off-policy algorithms like SAC/TD3.
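
ExpRolloutBuffer keeps the same bootstrapping and masking bookkeeping but runs the backward pass in log space: interm carries beta * r_t plus the gamma * (1 - lambda)-weighted log of the clipped next value, each step adds the gamma * lambda-weighted log of the previous return, and the result is exponentiated. With lambda = 1 and no episode boundaries this unrolls to roughly exp(beta * sum_k gamma^k * r_{t+k}) times a bootstrap factor, i.e. an exponential-utility style (risk-sensitive) return, with the 1e-15 floor and np.maximum(0, .) guarding the logarithms. Below is a standalone sketch of one backward pass of that recursion on random data; the shapes, eps, and hyperparameters are illustrative assumptions, not values from the diff.

# Minimal sketch of the ExpRolloutBuffer recursion, assuming positive value
# estimates and no terminations; beta, eps, and the shapes are illustrative.
import numpy as np

rng = np.random.default_rng(1)
T, n_envs = 8, 2
gamma, lam, beta, eps = 0.99, 0.95, 0.1, 1e-15

rewards = rng.normal(size=(T, n_envs)).astype(np.float32)
values = np.abs(rng.normal(size=(T, n_envs))).astype(np.float32)     # V(s_0) .. V(s_{T-1})
last_values = np.abs(rng.normal(size=(n_envs,))).astype(np.float32)  # bootstrap V(s_T)
episode_starts = np.zeros((T, n_envs), dtype=np.float32)
dones = np.zeros((n_envs,), dtype=np.float32)

all_values = np.concatenate((values, last_values.reshape(1, -1)))
nnt = (1.0 - np.concatenate((episode_starts, dones.reshape(1, -1))))[1:]

# Mix log-values with weight (1 - lambda), recurse on log-returns with weight lambda,
# then exponentiate each step, mirroring the loop added in the diff.
interm = beta * rewards + gamma * (1 - lam) * nnt * np.log(eps + np.maximum(0, all_values[1:]))
ret = [values[-1]]  # same seed as the diff: the last stored value estimate
for step in reversed(range(T)):
    ret.append(np.exp(interm[step] + gamma * lam * nnt[step] * np.log(eps + np.maximum(0, ret[-1]))))
returns = np.stack(list(reversed(ret))[:-1], 0)
advantages = returns - values
print(returns.shape, advantages.shape)  # (8, 2) (8, 2)
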