 '''
 
 
-import math
 import random
-
 import gym
 import numpy as np
-
 import torch
 import torch.nn as nn
 import torch.optim as optim
 import torch.nn.functional as F
 from torch.distributions import Categorical
-
 from IPython.display import clear_output
 import matplotlib.pyplot as plt
-from matplotlib import animation
-from IPython.display import display
-
 import argparse
-import time
 
 GPU = True
 device_idx = 0
@@ -104,7 +96,7 @@ def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3, log_std_mi
 
         self.num_actions = num_actions
 
-    def forward(self, state, softmax_dim=0):
+    def forward(self, state, softmax_dim=-1):
         x = F.tanh(self.linear1(state))
         x = F.tanh(self.linear2(x))
         # x = F.tanh(self.linear3(x))
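Note on the `softmax_dim` change above: with batched states of shape `(batch, num_actions)`, `dim=0` normalizes across the batch instead of across actions, while `dim=-1` yields a valid probability distribution per state. A minimal sketch with made-up logits (not from the repo):

```python
import torch
import torch.nn.functional as F

# Made-up logits for a batch of 2 states and 4 discrete actions.
logits = torch.tensor([[1.0, 2.0, 3.0, 4.0],
                       [4.0, 3.0, 2.0, 1.0]])

# dim=0 normalizes down the batch: each *column* sums to 1 (not a per-state policy).
print(F.softmax(logits, dim=0).sum(dim=0))    # tensor([1., 1., 1., 1.])

# dim=-1 normalizes over actions: each *row* sums to 1 (a valid per-state distribution).
print(F.softmax(logits, dim=-1).sum(dim=-1))  # tensor([1., 1.])
```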
@@ -183,7 +175,7 @@ def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy
         # reward = reward_scale * (reward - reward.mean(dim=0)) / (reward.std(dim=0) + 1e-6)  # normalize with batch mean and std; plus a small number to prevent numerical problem
 
         # Training Q Function
-        # print((next_log_prob.exp()* self.target_soft_q_net2(next_state)).shape, next_log_prob.shape )
+        self.alpha = self.log_alpha.exp()
         target_q_min = (next_log_prob.exp() * (torch.min(self.target_soft_q_net1(next_state), self.target_soft_q_net2(next_state)) - self.alpha * next_log_prob)).sum(dim=-1).unsqueeze(-1)
         target_q_value = reward + (1 - done) * gamma * target_q_min  # if done==1, only reward
         q_value_loss1 = self.soft_q_criterion1(predicted_q_value1, target_q_value.detach())  # detach: no gradients for the variable
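The line moved into this hunk recomputes `self.alpha` from `log_alpha` before the Q target is formed, so the target uses the current temperature rather than the value left over from the previous update. In the discrete case the target is an exact expectation over actions under the policy: the action probabilities weight the minimum of the two target Q-values minus the entropy term. A self-contained sketch of the same computation with toy tensors (shapes and values are made up, not taken from the repo):

```python
import torch

B, A = 3, 4                                                    # toy batch size and action count
next_log_prob = torch.log_softmax(torch.randn(B, A), dim=-1)   # log pi(a'|s')
q1_next, q2_next = torch.randn(B, A), torch.randn(B, A)        # stand-ins for the two target Q-nets
alpha, gamma = torch.tensor(0.2), 0.99
reward, done = torch.randn(B, 1), torch.zeros(B, 1)

# E_{a'~pi}[ min(Q1, Q2) - alpha * log pi ], computed exactly via the action probabilities.
probs = next_log_prob.exp()
target_q_min = (probs * (torch.min(q1_next, q2_next) - alpha * next_log_prob)).sum(dim=-1, keepdim=True)
target_q_value = reward + (1 - done) * gamma * target_q_min    # shape (B, 1)
```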
@@ -203,19 +195,6 @@ def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy
         self.policy_optimizer.zero_grad()
         policy_loss.backward()
         self.policy_optimizer.step()
-
-        # print('q loss: ', q_value_loss1, q_value_loss2)
-        # print('policy loss: ', policy_loss)
-
-        # Soft update the target value net
-        for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()):
-            target_param.data.copy_(  # copy data value into target parameters
-                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
-            )
-        for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()):
-            target_param.data.copy_(  # copy data value into target parameters
-                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
-            )
 
         # Updating alpha wrt entropy
         # alpha = 0.0  # trade-off between exploration (max entropy) and exploitation (max Q)
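For context on the "Updating alpha wrt entropy" block that follows: with `auto_entropy` enabled, the temperature is tuned by gradient descent on `log_alpha` so that the policy entropy tracks a target value. A minimal, self-contained sketch of that idea; the target entropy value and the stand-in log-probabilities below are assumptions, not taken from this file:

```python
import torch

log_alpha = torch.zeros(1, requires_grad=True)            # learn log(alpha) so alpha stays positive
alpha_optimizer = torch.optim.Adam([log_alpha], lr=3e-4)

target_entropy = -1.0                                      # assumed target, e.g. scaled by the action count
log_prob = torch.randn(8, 1) - 1.0                         # stand-in for log pi(a|s) of sampled actions

# Gradient descent on this loss raises alpha when entropy is below target and lowers it otherwise.
alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
alpha_optimizer.zero_grad()
alpha_loss.backward()
alpha_optimizer.step()
alpha = log_alpha.exp()                                    # temperature used in the next update
```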
@@ -225,10 +204,22 @@ def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy
             self.alpha_optimizer.zero_grad()
             alpha_loss.backward()
             self.alpha_optimizer.step()
-            self.alpha = self.log_alpha.exp()
         else:
             self.alpha = 1.
             alpha_loss = 0
+
+        # print('q loss: ', q_value_loss1.item(), q_value_loss2.item())
+        # print('policy loss: ', policy_loss.item())
+
+        # Soft update the target value net
+        for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()):
+            target_param.data.copy_(  # copy data value into target parameters
+                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
+            )
+        for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()):
+            target_param.data.copy_(  # copy data value into target parameters
+                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
+            )
 
         return predicted_new_q_value.mean()
 
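The soft target update block is moved here, after the temperature update, but the operation itself is unchanged: Polyak averaging of the online Q-network parameters into the targets with rate `soft_tau`. A standalone sketch of the same operation on a throwaway network; the helper name and the toy `nn.Linear` are illustrative only:

```python
import copy
import torch.nn as nn

def soft_update(target_net: nn.Module, source_net: nn.Module, tau: float) -> None:
    # theta_target <- (1 - tau) * theta_target + tau * theta_source
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

q_net = nn.Linear(4, 2)                 # stand-in for a soft Q-network
target_q_net = copy.deepcopy(q_net)     # target starts as an exact copy
soft_update(target_q_net, q_net, tau=1e-2)
```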
@@ -266,7 +257,7 @@ def plot(rewards):
 
 # hyper-parameters for RL training
 max_episodes = 10000
-max_steps = 100
+max_steps = 200
 frame_idx = 0
 batch_size = 256
 update_itr = 1
@@ -287,7 +278,6 @@ def plot(rewards):
         state = env.reset()
         episode_reward = 0
 
-
         for step in range(max_steps):
             action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC)
             next_state, reward, done, _ = env.step(action)
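For orientation, the hunk above sits inside the episode/step training loop. A minimal sketch of that loop's structure, assuming the older `gym` API used in this file, with a hypothetical deque buffer and a random action in place of the repo's replay buffer and `policy_net.get_action`:

```python
from collections import deque
import gym

env = gym.make('CartPole-v1')            # assumed discrete-action env
replay_buffer = deque(maxlen=100000)     # hypothetical stand-in for the repo's buffer
max_episodes, max_steps, batch_size = 3, 200, 256

for eps in range(max_episodes):
    state = env.reset()
    episode_reward = 0

    for step in range(max_steps):
        action = env.action_space.sample()              # stand-in for policy_net.get_action(state)
        next_state, reward, done, _ = env.step(action)  # older gym step signature, as in the diff
        replay_buffer.append((state, action, reward, next_state, done))
        state = next_state
        episode_reward += reward
        # once the buffer holds more than batch_size transitions, the SAC update runs here
        if done:
            break
    print(f'Episode {eps}: reward {episode_reward}')
```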