|
| 1 | +import numpy as np |
| 2 | + |
| 3 | +import torch |
| 4 | +import torch.nn as nn |
| 5 | + |
| 6 | +import pufferlib |
1 | 7 | from pufferlib.models import Default as Policy |
2 | 8 | from pufferlib.models import LSTMWrapper as Recurrent |
| 9 | + |
class FakePolicy(nn.Module):
    '''Continuous-action PyTorch policy: an MLP actor (Gaussian mean + learned
    log-std) and a separate MLP critic over flat observations.

    PufferLib is not a framework. It does not enforce a base class.
    You can use any PyTorch policy that returns actions and values.
    We structure our forward methods as encode_observations and decode_actions
    to make it easier to wrap policies with LSTMs. You can do that and use
    our LSTM wrapper or implement your own. To port an existing policy
    for use with our LSTM wrapper, simply put everything from forward() before
    the recurrent cell into encode_observations and put everything after
    into decode_actions.
    '''
    def __init__(self, env, hidden_size=256):
        '''Build actor and critic MLPs sized from the env's spaces.

        Args:
            env: Vectorized env exposing ``single_observation_space`` and
                ``single_action_space`` (continuous, 1-D action shape assumed).
            hidden_size: Width of every hidden layer (default 256).
        '''
        super().__init__()
        self.hidden_size = hidden_size

        # int(...) converts the NumPy scalar from np.prod into a plain int.
        n_obs = int(np.prod(env.single_observation_space.shape))
        n_atn = env.single_action_space.shape[0]

        # Actor head: 3 hidden tanh layers, small init std on the output
        # layer so initial action means stay near zero.
        self.decoder_mean = nn.Sequential(
            pufferlib.pytorch.layer_init(nn.Linear(n_obs, hidden_size)),
            nn.Tanh(),
            pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)),
            nn.Tanh(),
            pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)),
            nn.Tanh(),
            pufferlib.pytorch.layer_init(nn.Linear(hidden_size, n_atn), std=0.01),
        )
        # State-independent log standard deviation, one per action dim.
        self.decoder_logstd = nn.Parameter(torch.zeros(1, n_atn))

        # Critic head: same topology, scalar value output.
        self.value = nn.Sequential(
            pufferlib.pytorch.layer_init(nn.Linear(n_obs, hidden_size)),
            nn.Tanh(),
            pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)),
            nn.Tanh(),
            pufferlib.pytorch.layer_init(nn.Linear(hidden_size, hidden_size)),
            nn.Tanh(),
            pufferlib.pytorch.layer_init(nn.Linear(hidden_size, 1), std=1),
        )

    def forward_eval(self, observations, state=None):
        '''Encode observations, then return (action distribution, values).'''
        hidden = self.encode_observations(observations, state=state)
        logits, values = self.decode_actions(hidden)
        return logits, values

    def forward(self, observations, state=None):
        '''Alias for forward_eval (no separate train-time path).'''
        return self.forward_eval(observations, state)

    def encode_observations(self, observations, state=None):
        '''Encodes a batch of observations into hidden states. Here this is
        an identity pass-through; both heads consume raw flat observations.
        Assumes no time dimension (handled by LSTM wrappers).'''
        return observations

    def decode_actions(self, hidden):
        '''Decodes a batch of hidden states into a continuous (Normal) action
        distribution and value estimates. Assumes no time dimension
        (handled by LSTM wrappers).'''
        mean = self.decoder_mean(hidden)
        logstd = self.decoder_logstd.expand_as(mean)
        std = torch.exp(logstd)
        logits = torch.distributions.Normal(mean, std)
        values = self.value(hidden)
        return logits, values
0 commit comments