
Commit 381ce9e

Merge branch 'dev'
2 parents: 387b04e + bf3a0ec

5 files changed: +25 -27 lines

cherry/_version.py

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-__version__ = '0.0.9'
+__version__ = '0.1.0'
Binary file (-16 KB) not shown.

examples/pybullet/delayed_tsac_pybullet.py

Lines changed: 20 additions & 19 deletions

@@ -4,8 +4,6 @@
 An implementation of Soft Actor-Critic.
 """
 
-from OpenGL import GLU
-import ppt
 import copy
 import random
 import numpy as np
@@ -167,12 +165,6 @@ def update(replay,
     env.log("QF2 Loss: ", critic_qf2_loss.item())
     env.log("Average Rewards: ", batch.reward().mean().item())
 
-    # Plotting via PPT
-    '''
-    if random.random() < 0.05:
-        ppt.plot(replay[-1000:].reward().mean().item(), 'cherry true rewards - TSAC1 delayed')
-    '''
-
     # Update Critic Networks
     critic_qf1_optimizer.zero_grad()
     critic_qf1_loss.backward()
@@ -186,8 +178,8 @@ def update(replay,
     if STEP % DELAY == 0:
 
         # Policy loss
-        q_values = th.min( critic_qf1(batch.state(), actions),
-                           critic_qf2(batch.state(), actions) )
+        q_values = th.min(critic_qf1(batch.state(), actions),
+                          critic_qf2(batch.state(), actions))
         policy_loss = sac.policy_loss(log_probs, q_values, alpha)
 
         env.log("Policy Loss: ", policy_loss.item())
@@ -197,14 +189,13 @@ def update(replay,
         policy_optimizer.step()
 
         # Move target approximator parameters towards critic parameters per [3]
-        ch.models.polyak_average( source=target_qf1,
-                                  target=critic_qf1,
-                                  alpha=VF_TARGET_TAU )
-
-        ch.models.polyak_average( source=target_qf2,
-                                  target=critic_qf2,
-                                  alpha=VF_TARGET_TAU )
+        ch.models.polyak_average(source=target_qf1,
+                                 target=critic_qf1,
+                                 alpha=VF_TARGET_TAU)
 
+        ch.models.polyak_average(source=target_qf2,
+                                 target=critic_qf2,
+                                 alpha=VF_TARGET_TAU)
 
 
 if __name__ == '__main__':
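
Per the comment in the hunk above, the ch.models.polyak_average calls move the target networks toward the critics by a factor VF_TARGET_TAU. A minimal sketch of that soft (Polyak) target update in plain PyTorch, independent of cherry's exact signature; the function and variable values here are illustrative:

import torch as th
import torch.nn as nn

def soft_update(target_net, source_net, tau):
    # Move each target parameter a fraction tau toward the corresponding source parameter.
    with th.no_grad():
        for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)

# Illustrative usage with tiny stand-in networks.
critic_qf1 = nn.Linear(4, 1)
target_qf1 = nn.Linear(4, 1)
target_qf1.load_state_dict(critic_qf1.state_dict())  # start from identical weights
soft_update(target_qf1, critic_qf1, tau=0.01)        # tau plays the role of VF_TARGET_TAU
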
@@ -251,5 +242,15 @@ def update(replay,
         replay += ep_replay
         replay = replay[-REPLAY_SIZE:]
         if len(replay) > MIN_REPLAY:
-            update(replay, policy, critic_qf1, critic_qf2, target_qf1, target_qf2, log_alpha, policy_opt,
-                   qf1_opt, qf2_opt, alpha_opt, target_entropy)
+            update(replay,
+                   policy,
+                   critic_qf1,
+                   critic_qf2,
+                   target_qf1,
+                   target_qf2,
+                   log_alpha,
+                   policy_opt,
+                   qf1_opt,
+                   qf2_opt,
+                   alpha_opt,
+                   target_entropy)
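
The if STEP % DELAY == 0: guard seen in the hunks above is a delayed policy update in the TD3 style: the critics are trained every step, while the policy and target networks are refreshed only every DELAY steps. A schematic sketch of that control flow with placeholder update functions (not the ones defined in this example):

DELAY = 2  # illustrative value

def update_critics(batch):
    pass  # placeholder: one gradient step on both Q-networks

def update_policy_and_targets(batch):
    pass  # placeholder: policy gradient step followed by a soft target update

step = 0
for batch in [None] * 10:  # stand-in for minibatches sampled from the replay buffer
    step += 1
    update_critics(batch)
    if step % DELAY == 0:
        # The policy and targets deliberately lag behind the critics.
        update_policy_and_targets(batch)
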

examples/pybullet/ppo_pybullet.py

Lines changed: 1 addition & 0 deletions

@@ -159,6 +159,7 @@ def main(env='MinitaurTrottingEnv-v0'):
 if __name__ == '__main__':
     env_name = 'CartPoleBulletEnv-v0'
     env_name = 'AntBulletEnv-v0'
+    env_name = 'HalfCheetahBulletEnv-v0'
     # env_name = 'RoboschoolAnt-v1'
     # env_name = 'MinitaurTrottingEnv-v0'
     main(env_name)

examples/pybullet/sac_pybullet.py

Lines changed: 3 additions & 7 deletions

@@ -4,14 +4,12 @@
 An implementation of Soft Actor-Critic.
 """
 
-from OpenGL import GLU
-import ppt
+#from OpenGL import GLU
 import copy
 import random
 import numpy as np
 import gym
 import pybullet_envs
-import roboschool
 
 import torch as th
 import torch.nn as nn
@@ -155,8 +153,6 @@ def update(replay,
     env.log("VF Loss: ", vf_loss.item())
     env.log("Policy Loss: ", policy_loss.item())
     env.log("Average Rewards: ", batch.reward().mean().item())
-    if random.random() < 0.05:
-        ppt.plot(replay[-1000:].reward().mean().item(), 'cherry true rewards')
 
     # Update
     qf_opt.zero_grad()
@@ -181,9 +177,9 @@ def update(replay,
     np.random.seed(SEED)
     th.manual_seed(SEED)
     env_name = 'HalfCheetahBulletEnv-v0'
-    env_name = 'RoboschoolAnt-v1'
+    # env_name = 'AntBulletEnv-v0'
     env = gym.make(env_name)
-    env = envs.Logger(env, interval=1000)
+    env = envs.VisdomLogger(env, interval=1000)
     env = envs.ActionSpaceScaler(env)
     env = envs.Torch(env)
     env = envs.Runner(env)