Skip to content
4 changes: 3 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ jobs:
run: |
python -m pip install --upgrade pip
# CPU-only build of PyTorch - faster to download
pip install torch==1.4.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
pip install torch==1.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
# Install gSDE branch
pip install git+https://github.com/DLR-RM/stable-baselines3@sde
# Faster to install because a pre-built wheel is available
pip install pybullet==2.8.4
pip install -r requirements.txt
Expand Down
3 changes: 3 additions & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@ variables:

type-check:
script:
# Install gSDE branch
- pip install git+https://github.com/DLR-RM/stable-baselines3@sde
- make type

pytest:
script:
- pip install git+https://github.com/DLR-RM/stable-baselines3@sde
# Set MKL_THREADING_LAYER=GNU to avoid the MKL_THREADING_LAYER=INTEL incompatibility error
- MKL_THREADING_LAYER=GNU make pytest

Expand Down
163 changes: 84 additions & 79 deletions hyperparams/td3.yml
Original file line number Diff line number Diff line change
@@ -1,62 +1,68 @@
# Tuned
MountainCarContinuous-v0:
n_timesteps: 300000
normalize: True
n_timesteps: 30000
policy: 'MlpPolicy'
noise_type: 'ornstein-uhlenbeck'
noise_std: 0.5
learning_rate: !!float 3e-4
buffer_size: 50000
batch_size: 256
n_episodes_rollout: -1
gradient_steps: 8
train_freq: 8
learning_starts: 0
use_sde: True
policy_kwargs: "dict(log_std_init=0.0, net_arch=[64, 64])"

Pendulum-v0:
n_timesteps: 20000
policy: 'MlpPolicy'
gamma: 0.98
buffer_size: 200000
learning_starts: 10000
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"
train_freq: 64
gradient_steps: 64
n_episodes_rollout: -1
learning_rate: !!float 6e-4
policy_kwargs: "dict(net_arch=[400, 300], log_std_init=-3.62, lr_sde=1.5e-3)"
use_sde: True

LunarLanderContinuous-v2:
n_timesteps: !!float 3e5
policy: 'MlpPolicy'
gamma: 0.98
buffer_size: 200000
learning_starts: 10000
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"
train_freq: 64
gradient_steps: 64
n_episodes_rollout: -1
learning_rate: !!float 6e-4
policy_kwargs: "dict(net_arch=[400, 300], log_std_init=-3.62, lr_sde=1.5e-3)"
use_sde: True

BipedalWalker-v3:
n_timesteps: !!float 1e6
policy: 'MlpPolicy'
gamma: 0.98
buffer_size: 200000
learning_starts: 10000
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"
train_freq: 64
gradient_steps: 64
n_episodes_rollout: -1
learning_rate: !!float 6e-4
policy_kwargs: "dict(net_arch=[400, 300], log_std_init=-3.62, lr_sde=1.5e-3)"
use_sde: True

# To be tuned
BipedalWalkerHardcore-v3:
n_timesteps: !!float 1e7
policy: 'MlpPolicy'
gamma: 0.98
buffer_size: 200000
buffer_size: 500000
learning_starts: 10000
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"
train_freq: 64
gradient_steps: 64
n_episodes_rollout: -1
learning_rate: !!float 6e-4
policy_kwargs: "dict(net_arch=[400, 300], log_std_init=-3.62, lr_sde=1.5e-3, use_expln=True)"
use_sde: True

# Tuned
HalfCheetahBulletEnv-v0:
Expand All @@ -66,12 +72,12 @@ HalfCheetahBulletEnv-v0:
gamma: 0.98
buffer_size: 200000
learning_starts: 10000
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"
train_freq: 64
gradient_steps: 64
n_episodes_rollout: -1
learning_rate: !!float 6e-4
policy_kwargs: "dict(net_arch=[400, 300], log_std_init=-3.62, lr_sde=1.5e-3)"
use_sde: True

AntBulletEnv-v0:
env_wrapper: utils.wrappers.TimeFeatureWrapper
Expand All @@ -80,12 +86,12 @@ AntBulletEnv-v0:
gamma: 0.98
buffer_size: 200000
learning_starts: 10000
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"
train_freq: 64
gradient_steps: 64
n_episodes_rollout: -1
learning_rate: !!float 6e-4
policy_kwargs: "dict(net_arch=[400, 300], log_std_init=-3.62, lr_sde=1.5e-3)"
use_sde: True

HopperBulletEnv-v0:
env_wrapper: utils.wrappers.TimeFeatureWrapper
Expand All @@ -94,12 +100,12 @@ HopperBulletEnv-v0:
gamma: 0.98
buffer_size: 200000
learning_starts: 10000
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"
train_freq: 64
gradient_steps: 64
n_episodes_rollout: -1
learning_rate: !!float 6e-4
policy_kwargs: "dict(net_arch=[400, 300], log_std_init=-3.62, lr_sde=1.5e-3)"
use_sde: True

Walker2DBulletEnv-v0:
env_wrapper: utils.wrappers.TimeFeatureWrapper
Expand All @@ -108,13 +114,12 @@ Walker2DBulletEnv-v0:
gamma: 0.98
buffer_size: 200000
learning_starts: 10000
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

train_freq: 64
gradient_steps: 64
n_episodes_rollout: -1
learning_rate: !!float 6e-4
policy_kwargs: "dict(net_arch=[400, 300], log_std_init=-3.62, lr_sde=1.5e-3)"
use_sde: True

# To be tested
HumanoidBulletEnv-v0:
Expand All @@ -124,12 +129,12 @@ HumanoidBulletEnv-v0:
gamma: 0.98
buffer_size: 200000
learning_starts: 10000
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"
train_freq: 64
gradient_steps: 64
n_episodes_rollout: -1
learning_rate: !!float 6e-4
policy_kwargs: "dict(net_arch=[400, 300], log_std_init=-3.62, lr_sde=1.5e-3)"
use_sde: True

# Tuned
ReacherBulletEnv-v0:
Expand All @@ -139,12 +144,12 @@ ReacherBulletEnv-v0:
gamma: 0.98
buffer_size: 200000
learning_starts: 10000
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"
train_freq: 64
gradient_steps: 64
n_episodes_rollout: -1
learning_rate: !!float 6e-4
policy_kwargs: "dict(net_arch=[400, 300], log_std_init=-3.62, lr_sde=1.5e-3)"
use_sde: True

# Tuned
InvertedDoublePendulumBulletEnv-v0:
Expand All @@ -154,12 +159,12 @@ InvertedDoublePendulumBulletEnv-v0:
gamma: 0.98
buffer_size: 200000
learning_starts: 10000
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"
train_freq: 64
gradient_steps: 64
n_episodes_rollout: -1
learning_rate: !!float 6e-4
policy_kwargs: "dict(net_arch=[400, 300], log_std_init=-3.62, lr_sde=1.5e-3)"
use_sde: True

# Tuned
InvertedPendulumSwingupBulletEnv-v0:
Expand All @@ -169,12 +174,12 @@ InvertedPendulumSwingupBulletEnv-v0:
gamma: 0.98
buffer_size: 200000
learning_starts: 10000
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"
train_freq: 64
gradient_steps: 64
n_episodes_rollout: -1
learning_rate: !!float 6e-4
policy_kwargs: "dict(net_arch=[400, 300], log_std_init=-3.62, lr_sde=1.5e-3)"
use_sde: True

MinitaurBulletEnv-v0:
env_wrapper: utils.wrappers.TimeFeatureWrapper
Expand Down
Loading