
Commit 791354a

fix ppo_gae_continuous2
1 parent 34f6f70 commit 791354a

2 files changed: 58 additions & 10 deletions


ppo_gae_continuous.py

Lines changed: 3 additions & 3 deletions
@@ -60,8 +60,8 @@ def pi(self, x):
 
         x = F.tanh(self.linear1(x))
         x = F.tanh(self.linear2(x))
-        x1 = F.tanh(self.linear3(x).detach()) # std learning not BP to the feature
-        x2 = F.tanh(self.linear4(x))
+        x1 = F.tanh(self.linear3(x))
+        x2 = F.tanh(self.linear4(x.detach())) # std learning not BP to the feature
 
         mean = F.tanh(self.mean_linear(x1))
         log_std = self.log_std_linear(x2)
@@ -169,7 +169,7 @@ def main():
     env = gym.make('Ant-v2')
     state_dim = env.observation_space.shape[0]
     action_dim = env.action_space.shape[0]
-    hidden_dim = 128
+    hidden_dim = 64
     model = PPO(state_dim, action_dim, hidden_dim)
     score = 0.0
     print_interval = 2
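The first hunk moves the detach() from the mean branch to the std branch, so the comment "std learning not BP to the feature" now matches the code: gradients from the log-std head no longer flow back into the shared feature layers, while the mean head still trains them. A minimal, self-contained sketch of that effect (the layer names here are stand-ins, not the ones in the file):

import torch
import torch.nn as nn

shared = nn.Linear(4, 8)               # stand-in for the shared linear1/linear2 feature layers
mean_head = nn.Linear(8, 2)            # stand-in for the linear3 -> mean path
log_std_head = nn.Linear(8, 2)         # stand-in for the linear4 -> log_std path

x = torch.randn(1, 4)
feat = torch.tanh(shared(x))

mean = mean_head(feat)                 # gradients from the mean loss reach `shared`
log_std = log_std_head(feat.detach())  # gradients from the std loss stop at the detach

(mean.sum() + log_std.sum()).backward()
print(shared.weight.grad)              # populated by the mean branch only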

ppo_gae_continuous2.py

Lines changed: 55 additions & 7 deletions
@@ -41,6 +41,45 @@ def _reverse_action(self, action):
 
         return action
 
+def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
+    torch.nn.init.orthogonal_(layer.weight, std)
+    torch.nn.init.constant_(layer.bias, bias_const)
+    return layer
+
+# class PPO(nn.Module):
+#     def __init__(self, num_inputs, num_actions, hidden_size, action_range = 1.):
+#         super(PPO, self).__init__()
+#         self.data = []
+#         self.action_range = action_range
+#         self.v_linear = nn.Sequential(
+#             layer_init(nn.Linear(num_inputs, 64)),
+#             nn.Tanh(),
+#             layer_init(nn.Linear(64, 64)),
+#             nn.Tanh(),
+#             layer_init(nn.Linear(64, 1), std=1.0),
+#         )
+#         self.mean_linear = nn.Sequential(
+#             layer_init(nn.Linear(num_inputs, 64)),
+#             nn.Tanh(),
+#             layer_init(nn.Linear(64, 64)),
+#             nn.Tanh(),
+#             layer_init(nn.Linear(64, num_actions), std=0.01),
+#         )
+#         self.log_std_param = nn.Parameter(torch.zeros(num_actions))
+
+#         self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
+
+#     def pi(self, x):
+#         mean = self.mean_linear(x)
+#         # log_std = self.log_std_linear(x)
+#         log_std = self.log_std_param.expand_as(mean)
+
+#         return mean, log_std
+
+#     def v(self, x):
+#         v = self.v_linear(x)
+#         return v
+
 class PPO(nn.Module):
     def __init__(self, num_inputs, num_actions, hidden_size, action_range = 1.):
         super(PPO, self).__init__()
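The new layer_init helper applies the common PPO initialization scheme: orthogonal weights with a chosen gain and a constant bias. The commented-out class above shows the intended gains: the default sqrt(2) for hidden layers, 1.0 for the value head, 0.01 for the action-mean head. A short usage sketch (the layer sizes are illustrative only, not taken from the file):

import numpy as np
import torch
import torch.nn as nn

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    # orthogonal weight init with the given gain, constant bias
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer

hidden = layer_init(nn.Linear(64, 64))               # default gain sqrt(2) for hidden layers
value_head = layer_init(nn.Linear(64, 1), std=1.0)   # unit gain for the value head
mean_head = layer_init(nn.Linear(64, 6), std=0.01)   # near-zero init for the action-mean head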
@@ -66,8 +105,8 @@ def pi(self, x):
 
         x = F.tanh(self.linear1(x))
         x = F.tanh(self.linear2(x))
-        x1 = F.tanh(self.linear3(x).detach()) # std learning not BP to the feature
-        x2 = F.tanh(self.linear4(x))
+        x1 = F.tanh(self.linear3(x))
+        x2 = F.tanh(self.linear4(x.detach())) # std learning not BP to the feature
 
         mean = F.tanh(self.mean_linear(x1))
         log_std = self.log_std_linear(x2)
@@ -126,7 +165,7 @@ def make_batch(self):
             r_lst.append([r])
             s_prime_lst.append(s_prime)
             prob_a_lst.append([prob_a])
-            value_lst.append([v])
+            value_lst.append(v)
             done_mask = 0 if done else 1
             done_lst.append([done_mask])
         s,a,r,s_prime,v,done_mask,prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
@@ -151,8 +190,7 @@ def train_net(self):
 
         if not np.isnan(advantage.std()):
             advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
-
-        td_target = advantage + self.v(s)
+        td_target = advantage + v
 
         for i in range(K_epoch):
            mean, log_std = self.pi(s)
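With GAE, the value target is simply the advantage added back onto the stored value estimate, which is why the hunk above can reuse the batched v returned by make_batch instead of recomputing self.v(s). A self-contained sketch of that identity (function and argument names are assumptions, not the file's; done_masks follows the script's 0-if-done convention):

import torch

def gae_advantages(rewards, values, next_value, done_masks, gamma=0.99, lmbda=0.95):
    # rewards, values, done_masks: 1-D tensors of length T; done_masks is 0 at terminal steps
    advantages = torch.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        v_next = next_value if t == len(rewards) - 1 else values[t + 1]
        delta = rewards[t] + gamma * v_next * done_masks[t] - values[t]
        running = delta + gamma * lmbda * done_masks[t] * running
        advantages[t] = running
    td_target = advantages + values   # same identity as `td_target = advantage + v`
    return advantages, td_target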
@@ -173,15 +211,23 @@ def main():
     env = gym.make('Ant-v2')
     state_dim = env.observation_space.shape[0]
     action_dim = env.action_space.shape[0]
-    hidden_dim = 128
+    hidden_dim = 64
     model = PPO(state_dim, action_dim, hidden_dim)
     score = 0.0
     print_interval = 2
     step = 0
+    update = 0
+    n_epis = 10000
 
-    for n_epi in range(10000):
+    for n_epi in range(n_epis):
         s = env.reset()
         done = False
+
+        ## learning rate schedule
+        # frac = 1.0 - (n_epi - 1.0) / n_epis
+        # lrnow = frac * learning_rate
+        # model.optimizer.param_groups[0]["lr"] = lrnow
+
         # while not done:
         for t in range(T_horizon):
             step += 1
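The commented-out block anneals the learning rate linearly over episodes by writing into param_groups directly. An alternative sketch of the same linear decay using PyTorch's built-in LambdaLR scheduler (this is an assumption about intent, not code from the commit; the optimizer here is a toy stand-in for model.optimizer):

import torch
from torch.optim.lr_scheduler import LambdaLR

params = [torch.nn.Parameter(torch.zeros(1))]      # toy parameter standing in for the model
optimizer = torch.optim.Adam(params, lr=3e-4)

n_epis = 10000
scheduler = LambdaLR(optimizer, lr_lambda=lambda epi: 1.0 - epi / n_epis)

for n_epi in range(n_epis):
    # ... collect rollouts and call train_net() here ...
    scheduler.step()   # shrinks lr linearly, like the commented-out manual update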
@@ -197,9 +243,11 @@ def main():
 
             if (step+1) % batch_size == 0:
                 model.train_net()
+                update += 1
 
             if done:
                 break
+
         if n_epi%print_interval==0 and n_epi!=0:
             print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval))
             score = 0.0
