@@ -41,6 +41,45 @@ def _reverse_action(self, action):

         return action

+def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
+    torch.nn.init.orthogonal_(layer.weight, std)
+    torch.nn.init.constant_(layer.bias, bias_const)
+    return layer
+
+# class PPO(nn.Module):
+#     def __init__(self, num_inputs, num_actions, hidden_size, action_range=1.):
+#         super(PPO, self).__init__()
+#         self.data = []
+#         self.action_range = action_range
+#         self.v_linear = nn.Sequential(
+#             layer_init(nn.Linear(num_inputs, 64)),
+#             nn.Tanh(),
+#             layer_init(nn.Linear(64, 64)),
+#             nn.Tanh(),
+#             layer_init(nn.Linear(64, 1), std=1.0),
+#         )
+#         self.mean_linear = nn.Sequential(
+#             layer_init(nn.Linear(num_inputs, 64)),
+#             nn.Tanh(),
+#             layer_init(nn.Linear(64, 64)),
+#             nn.Tanh(),
+#             layer_init(nn.Linear(64, num_actions), std=0.01),
+#         )
+#         self.log_std_param = nn.Parameter(torch.zeros(num_actions))
+
+#         self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
+
+#     def pi(self, x):
+#         mean = self.mean_linear(x)
+#         # log_std = self.log_std_linear(x)
+#         log_std = self.log_std_param.expand_as(mean)
+
+#         return mean, log_std
+
+#     def v(self, x):
+#         v = self.v_linear(x)
+#         return v
+
 class PPO(nn.Module):
     def __init__(self, num_inputs, num_actions, hidden_size, action_range=1.):
         super(PPO, self).__init__()
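The new `layer_init` helper applies the usual PPO initialization recipe: orthogonal weights scaled by a gain and a constant bias, with a smaller gain on the output head. A minimal sketch of how such a helper is typically used, with illustrative layer sizes that are not taken from this commit:

```python
import numpy as np
import torch
import torch.nn as nn

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    # Orthogonal weight init with gain `std`, constant bias (mirrors the diff).
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer

# Illustrative usage (sizes are hypothetical): hidden layers get gain sqrt(2),
# the action head a small gain so the initial policy output stays near zero.
policy_head = nn.Sequential(
    layer_init(nn.Linear(8, 64)),
    nn.Tanh(),
    layer_init(nn.Linear(64, 2), std=0.01),
)
```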
@@ -66,8 +105,8 @@ def pi(self, x):

         x = F.tanh(self.linear1(x))
         x = F.tanh(self.linear2(x))
-        x1 = F.tanh(self.linear3(x).detach())  # std learning not BP to the feature
-        x2 = F.tanh(self.linear4(x))
+        x1 = F.tanh(self.linear3(x))
+        x2 = F.tanh(self.linear4(x.detach()))  # std learning not BP to the feature

         mean = F.tanh(self.mean_linear(x1))
         log_std = self.log_std_linear(x2)
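This change moves the `.detach()` from the mean branch to the log-std branch, so gradients from the std head no longer update the shared feature layers while the mean head still does. A small sketch of that effect, using hypothetical layer names rather than the full class from this file:

```python
import torch
import torch.nn as nn

# Hypothetical shared trunk with two heads, mirroring the pattern in the diff.
shared = nn.Linear(4, 8)
mean_head = nn.Linear(8, 2)
log_std_head = nn.Linear(8, 2)

x = torch.randn(1, 4)
h = torch.tanh(shared(x))

mean = torch.tanh(mean_head(h))       # gradients from the mean reach `shared`
log_std = log_std_head(h.detach())    # gradients from log_std stop at the detach

(mean.sum() + log_std.sum()).backward()
# Only the mean path contributed to the trunk's gradient.
print(shared.weight.grad.abs().sum() > 0)   # tensor(True)
```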
@@ -126,7 +165,7 @@ def make_batch(self):
             r_lst.append([r])
             s_prime_lst.append(s_prime)
             prob_a_lst.append([prob_a])
-            value_lst.append([v])
+            value_lst.append(v)
             done_mask = 0 if done else 1
             done_lst.append([done_mask])
         s,a,r,s_prime,v,done_mask,prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
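Appending `v` directly instead of `[v]` removes one level of nesting when the rollout buffer is converted to a tensor. A quick shape check under the assumption that each stored value is already a length-1 list per step (the surrounding buffer code is not shown in this hunk):

```python
import torch

# Assumption: per-step value estimates are stored as length-1 lists, e.g. v = [0.5].
old_style = [[[0.5]], [[0.7]]]   # value_lst.append([v]) -> extra nesting
new_style = [[0.5], [0.7]]       # value_lst.append(v)

print(torch.tensor(old_style).shape)  # torch.Size([2, 1, 1])
print(torch.tensor(new_style).shape)  # torch.Size([2, 1])
```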
@@ -151,8 +190,7 @@ def train_net(self):

         if not np.isnan(advantage.std()):
             advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
-
-        td_target = advantage + self.v(s)
+        td_target = advantage + v

         for i in range(K_epoch):
             mean, log_std = self.pi(s)
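After this change the TD target is built from the value estimates `v` stored at rollout time rather than a fresh `self.v(s)` forward pass, with the (optionally normalized) advantage added on top. A sketch of that ordering with placeholder tensors standing in for the batch:

```python
import torch

# Placeholder advantages and stored value estimates (illustrative only).
advantage = torch.tensor([[0.2], [1.5], [-0.7]])
v = torch.tensor([[0.9], [1.1], [0.4]])   # values recorded when the batch was collected

# Normalize advantages unless the std is degenerate, then build the value target
# from the stored values, as in the diff.
if not torch.isnan(advantage.std()):
    advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
td_target = advantage + v
```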
@@ -173,15 +211,23 @@ def main():
     env = gym.make('Ant-v2')
     state_dim = env.observation_space.shape[0]
     action_dim = env.action_space.shape[0]
-    hidden_dim = 128
+    hidden_dim = 64
     model = PPO(state_dim, action_dim, hidden_dim)
     score = 0.0
     print_interval = 2
     step = 0
+    update = 0
+    n_epis = 10000

-    for n_epi in range(10000):
+    for n_epi in range(n_epis):
         s = env.reset()
         done = False
+
+        ## learning rate schedule
+        # frac = 1.0 - (n_epi - 1.0) / n_epis
+        # lrnow = frac * learning_rate
+        # model.optimizer.param_groups[0]["lr"] = lrnow
+
         # while not done:
         for t in range(T_horizon):
             step += 1
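The commented-out block is a linear learning-rate schedule: the LR would decay from its initial value toward zero over `n_epis` episodes. A runnable sketch of the same idea, assuming a global `learning_rate` used when the optimizer was created (that value is not shown in this hunk):

```python
import torch

learning_rate = 3e-4                      # assumed initial LR (not shown in this hunk)
params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.Adam(params, lr=learning_rate)

n_epis = 10000
for n_epi in range(1, n_epis + 1):
    # Linear anneal: frac goes from 1.0 at the first episode to ~0 at the last.
    frac = 1.0 - (n_epi - 1.0) / n_epis
    optimizer.param_groups[0]["lr"] = frac * learning_rate
```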
@@ -197,9 +243,11 @@ def main():

             if (step + 1) % batch_size == 0:
                 model.train_net()
+                update += 1

             if done:
                 break
+
         if n_epi % print_interval == 0 and n_epi != 0:
             print("# of episode :{}, avg score : {:.1f}".format(n_epi, score / print_interval))
             score = 0.0