9
9
10
10
#Hyperparameters
11
11
learning_rate = 1e-4
12
- gamma = 0.98
12
+ gamma = 0.99
13
13
lmbda = 0.95
14
14
eps_clip = 0.1
15
15
batch_size = 4096
16
- K_epoch = 3
16
+ K_epoch = 20
17
17
T_horizon = 10000
18
18
19
19
class NormalizedActions (gym .ActionWrapper ):
@@ -50,6 +50,8 @@ def __init__(self, num_inputs, num_actions, hidden_size, action_range = 1.):
50
50
51
51
self .mean_linear = nn .Linear (hidden_size , num_actions )
52
52
self .log_std_linear = nn .Linear (hidden_size , num_actions )
53
+ # self.log_std_param = nn.Parameter(torch.zeros(num_actions))
54
+
53
55
self .v_linear = nn .Linear (hidden_size , 1 )
54
56
55
57
self .optimizer = optim .Adam (self .parameters (), lr = learning_rate )
@@ -63,6 +65,7 @@ def pi(self, x):
63
65
64
66
mean = F .tanh (self .mean_linear (x1 ))
65
67
log_std = self .log_std_linear (x2 )
68
+ # log_std = self.log_std_param.expand_as(mean)
66
69
67
70
return mean , log_std
68
71
@@ -129,27 +132,24 @@ def train_net(self):
129
132
s , a , r , s_prime , done_mask , prob_a = self .make_batch ()
130
133
done_mask_ = torch .flip (done_mask , dims = (0 ,))
131
134
132
- for i in range (K_epoch ):
133
- td_target = r + gamma * self .v (s_prime ) * done_mask
134
- delta = td_target - self .v (s )
135
- delta = delta .detach ().numpy ()
136
-
137
- advantage_lst = []
138
- advantage = 0.0
139
- for delta_t , mask in zip (delta [::- 1 ], done_mask_ ):
140
- advantage = gamma * lmbda * advantage * mask + delta_t [0 ]
141
- advantage_lst .append ([advantage ])
142
- advantage_lst .reverse ()
143
- advantage = torch .tensor (advantage_lst , dtype = torch .float )
144
- if not np .isnan (advantage .std ()):
145
- advantage = (advantage - advantage .mean ()) / (advantage .std () + 1e-5 )
146
-
135
+ # Compute the target value and advantages once, before the epoch updates, to reduce computation and stabilize training.
136
+ td_target = r + gamma * self .v (s_prime ) * done_mask
137
+ delta = td_target - self .v (s )
138
+ delta = delta .detach ().numpy ()
139
+
140
+ advantage_lst = []
141
+ advantage = 0.0
142
+ for delta_t , mask in zip (delta [::- 1 ], done_mask_ ):
143
+ advantage = gamma * lmbda * advantage * mask + delta_t [0 ]
144
+ advantage_lst .append ([advantage ])
145
+ advantage_lst .reverse ()
146
+ advantage = torch .tensor (advantage_lst , dtype = torch .float )
147
+ if not np .isnan (advantage .std ()):
148
+ advantage = (advantage - advantage .mean ()) / (advantage .std () + 1e-5 )
149
+
150
+ for i in range (K_epoch ):
147
151
mean , log_std = self .pi (s )
148
- try :
149
- log_pi_a = self .get_log_prob (mean , log_std , a )
150
- except :
151
- print (s , a )
152
- print (mean , log_std )
152
+ log_pi_a = self .get_log_prob (mean , log_std , a )
153
153
# pi = self.pi(s, softmax_dim=1)
154
154
# pi_a = pi.gather(1,a)
155
155
ratio = torch .exp (log_pi_a - torch .log (prob_a )) # a/b == exp(log(a)-log(b))
0 commit comments