 '''
 
 
-import math
 import random
-
 import gym
 import numpy as np
-
 import torch
 import torch.nn as nn
 import torch.optim as optim
 import torch.nn.functional as F
 from torch.distributions import Categorical
-
 from IPython.display import clear_output
 import matplotlib.pyplot as plt
-from matplotlib import animation
-from IPython.display import display
-
 import argparse
-import time
 
 GPU = True
 device_idx = 0
@@ -104,7 +96,7 @@ def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3, log_std_mi
 
         self.num_actions = num_actions
 
-    def forward(self, state, softmax_dim=0):
+    def forward(self, state, softmax_dim=-1):
         x = F.tanh(self.linear1(state))
         x = F.tanh(self.linear2(x))
         # x = F.tanh(self.linear3(x))
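Note on the `softmax_dim` change above: with batched states of shape `(batch, num_actions)`, `dim=0` normalizes across the batch instead of across actions, while `dim=-1` yields a valid probability distribution per state. A minimal sketch with made-up logits (not from the repo):

```python
import torch
import torch.nn.functional as F

# Made-up logits for a batch of 2 states and 4 discrete actions.
logits = torch.tensor([[1.0, 2.0, 3.0, 4.0],
                       [4.0, 3.0, 2.0, 1.0]])

# dim=0 normalizes down the batch: each *column* sums to 1 (not a per-state policy).
print(F.softmax(logits, dim=0).sum(dim=0))    # tensor([1., 1., 1., 1.])

# dim=-1 normalizes over actions: each *row* sums to 1 (a valid per-state distribution).
print(F.softmax(logits, dim=-1).sum(dim=-1))  # tensor([1., 1.])
```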
@@ -183,7 +175,7 @@ def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy
         # reward = reward_scale * (reward - reward.mean(dim=0)) / (reward.std(dim=0) + 1e-6)  # normalize with batch mean and std; plus a small number to prevent numerical problem
 
         # Training Q Function
-        # print((next_log_prob.exp()* self.target_soft_q_net2(next_state)).shape, next_log_prob.shape )
+        self.alpha = self.log_alpha.exp()
         target_q_min = (next_log_prob.exp() * (torch.min(self.target_soft_q_net1(next_state), self.target_soft_q_net2(next_state)) - self.alpha * next_log_prob)).sum(dim=-1).unsqueeze(-1)
         target_q_value = reward + (1 - done) * gamma * target_q_min  # if done==1, only reward
         q_value_loss1 = self.soft_q_criterion1(predicted_q_value1, target_q_value.detach())  # detach: no gradients for the variable
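The line moved into this hunk recomputes `self.alpha` from `log_alpha` before the Q target is formed, so the target uses the current temperature rather than the value left over from the previous update. In the discrete case the target is an exact expectation over actions under the policy: the action probabilities weight the minimum of the two target Q-values minus the entropy term. A self-contained sketch of the same computation with toy tensors (shapes and values are made up, not taken from the repo):

```python
import torch

B, A = 3, 4                                                    # toy batch size and action count
next_log_prob = torch.log_softmax(torch.randn(B, A), dim=-1)   # log pi(a'|s')
q1_next, q2_next = torch.randn(B, A), torch.randn(B, A)        # stand-ins for the two target Q-nets
alpha, gamma = torch.tensor(0.2), 0.99
reward, done = torch.randn(B, 1), torch.zeros(B, 1)

# E_{a'~pi}[ min(Q1, Q2) - alpha * log pi ], computed exactly via the action probabilities.
probs = next_log_prob.exp()
target_q_min = (probs * (torch.min(q1_next, q2_next) - alpha * next_log_prob)).sum(dim=-1, keepdim=True)
target_q_value = reward + (1 - done) * gamma * target_q_min    # shape (B, 1)
```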
@@ -203,19 +195,6 @@ def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy
         self.policy_optimizer.zero_grad()
         policy_loss.backward()
         self.policy_optimizer.step()
-
-        # print('q loss: ', q_value_loss1, q_value_loss2)
-        # print('policy loss: ', policy_loss)
-
-        # Soft update the target value net
-        for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()):
-            target_param.data.copy_(  # copy data value into target parameters
-                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
-            )
-        for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()):
-            target_param.data.copy_(  # copy data value into target parameters
-                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
-            )
 
         # Updating alpha wrt entropy
         # alpha = 0.0  # trade-off between exploration (max entropy) and exploitation (max Q)
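For context on the "Updating alpha wrt entropy" block that follows: with `auto_entropy` enabled, the temperature is tuned by gradient descent on `log_alpha` so that the policy entropy tracks a target value. A minimal, self-contained sketch of that idea; the target entropy value and the stand-in log-probabilities below are assumptions, not taken from this file:

```python
import torch

log_alpha = torch.zeros(1, requires_grad=True)            # learn log(alpha) so alpha stays positive
alpha_optimizer = torch.optim.Adam([log_alpha], lr=3e-4)

target_entropy = -1.0                                      # assumed target, e.g. scaled by the action count
log_prob = torch.randn(8, 1) - 1.0                         # stand-in for log pi(a|s) of sampled actions

# Gradient descent on this loss raises alpha when entropy is below target and lowers it otherwise.
alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
alpha_optimizer.zero_grad()
alpha_loss.backward()
alpha_optimizer.step()
alpha = log_alpha.exp()                                    # temperature used in the next update
```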
@@ -225,10 +204,22 @@ def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy
             self.alpha_optimizer.zero_grad()
             alpha_loss.backward()
             self.alpha_optimizer.step()
-            self.alpha = self.log_alpha.exp()
         else:
             self.alpha = 1.
             alpha_loss = 0
+
+        # print('q loss: ', q_value_loss1.item(), q_value_loss2.item())
+        # print('policy loss: ', policy_loss.item())
+
+        # Soft update the target value net
+        for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()):
+            target_param.data.copy_(  # copy data value into target parameters
+                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
+            )
+        for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()):
+            target_param.data.copy_(  # copy data value into target parameters
+                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
+            )
 
         return predicted_new_q_value.mean()
 
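The soft target update block is moved here, after the temperature update, but the operation itself is unchanged: Polyak averaging of the online Q-network parameters into the targets with rate `soft_tau`. A standalone sketch of the same operation on a throwaway network; the helper name and the toy `nn.Linear` are illustrative only:

```python
import copy
import torch.nn as nn

def soft_update(target_net: nn.Module, source_net: nn.Module, tau: float) -> None:
    # theta_target <- (1 - tau) * theta_target + tau * theta_source
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

q_net = nn.Linear(4, 2)                 # stand-in for a soft Q-network
target_q_net = copy.deepcopy(q_net)     # target starts as an exact copy
soft_update(target_q_net, q_net, tau=1e-2)
```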
@@ -266,7 +257,7 @@ def plot(rewards):
 
 # hyper-parameters for RL training
 max_episodes = 10000
-max_steps = 100
+max_steps = 200
 frame_idx = 0
 batch_size = 256
 update_itr = 1
@@ -287,7 +278,6 @@ def plot(rewards):
         state = env.reset()
         episode_reward = 0
 
-
         for step in range(max_steps):
             action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC)
             next_state, reward, done, _ = env.step(action)
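For orientation, the hunk above sits inside the episode/step training loop. A minimal sketch of that loop's structure, assuming the older `gym` API used in this file, with a hypothetical deque buffer and a random action in place of the repo's replay buffer and `policy_net.get_action`:

```python
from collections import deque
import gym

env = gym.make('CartPole-v1')            # assumed discrete-action env
replay_buffer = deque(maxlen=100000)     # hypothetical stand-in for the repo's buffer
max_episodes, max_steps, batch_size = 3, 200, 256

for eps in range(max_episodes):
    state = env.reset()
    episode_reward = 0

    for step in range(max_steps):
        action = env.action_space.sample()              # stand-in for policy_net.get_action(state)
        next_state, reward, done, _ = env.step(action)  # older gym step signature, as in the diff
        replay_buffer.append((state, action, reward, next_state, done))
        state = next_state
        episode_reward += reward
        # once the buffer holds more than batch_size transitions, the SAC update runs here
        if done:
            break
    print(f'Episode {eps}: reward {episode_reward}')
```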