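WIP checkpoint before working on other stuff: set DiffusionPolicy horizon=16, infer action padding dim, handle more image layouts, simplify collate_fn, lower default --num_episodes to 5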

Your Name · Your Name · commit 658d87b29720 · 2025-07-09T19:48:20.000Z
diff --git a/examples/lerobot/robodm_training_pipeline.py b/examples/lerobot/robodm_training_pipeline.py
@@ -95,8 +95,8 @@ def _convert_trajectory(self, trajectory: Dict[str, Any], episode_idx: int) -> L
         else:
             return []  # No valid data found
         
-        # DiffusionPolicy expects sequences, so we need horizon=16 for actions
-        horizon = 16
+        # DiffusionPolicy expects sequences with full prediction horizon
+        horizon = 16  # This should match DiffusionPolicy's horizon (not n_action_steps)
         timesteps = []
         
         # Create training samples with action sequences
@@ -138,19 +138,22 @@ def _convert_trajectory(self, trajectory: Dict[str, Any], episode_idx: int) -> L
                             action_is_pad_sequence.append(False)
                         else:
                             # Pad with zeros
-                            action_sequence.append(torch.zeros(2, dtype=torch.float32))  # Assuming 2D actions
+                            action_dim = action_data.shape[0] if hasattr(action_data, 'shape') else 2
+                            action_sequence.append(torch.zeros(action_dim, dtype=torch.float32))
                             action_is_pad_sequence.append(True)
                     else:
                         # Pad with zeros when we run out of actions
-                        action_sequence.append(torch.zeros(2, dtype=torch.float32))  # Assuming 2D actions
+                        action_dim = action_sequence[0].shape[0] if action_sequence else 2
+                        action_sequence.append(torch.zeros(action_dim, dtype=torch.float32))
                         action_is_pad_sequence.append(True)
                 
                 # Stack into sequence tensors
                 timestep['action'] = torch.stack(action_sequence)  # Shape: [horizon, action_dim]
                 timestep['action_is_pad'] = torch.tensor(action_is_pad_sequence, dtype=torch.bool)  # Shape: [horizon]
             else:
-                # No action data at all
-                timestep['action'] = torch.zeros(horizon, 2, dtype=torch.float32)  # Shape: [horizon, action_dim]
+                # No action data at all - use default action dimension
+                default_action_dim = 2  # You should adjust this to match your robot's action space
+                timestep['action'] = torch.zeros(horizon, default_action_dim, dtype=torch.float32)  # Shape: [horizon, action_dim]
                 timestep['action_is_pad'] = torch.ones(horizon, dtype=torch.bool)  # All padded
             
             timesteps.append(timestep)
@@ -176,10 +179,30 @@ def _add_image_observation_sequences(self, timestep: Dict[str, torch.Tensor], tr
                         # Make a copy to ensure the array is writable
                         image_data = image_data.copy()
                         # Convert to tensor, ensure it's in CHW format
-                        if len(image_data.shape) == 3 and image_data.shape[2] == 3:  # HWC format
-                            image_tensor = torch.from_numpy(image_data).permute(2, 0, 1).float() / 255.0
-                        else:  # Already in CHW format
-                            image_tensor = torch.from_numpy(image_data).float() / 255.0
+                        if len(image_data.shape) == 3:
+                            # Check if it's HWC format (height, width, channels)
+                            if image_data.shape[2] == 3:  # HWC format
+                                image_tensor = torch.from_numpy(image_data).permute(2, 0, 1).float() / 255.0
+                            elif image_data.shape[0] == 3:  # Already CHW format
+                                image_tensor = torch.from_numpy(image_data).float() / 255.0
+                            else:
+                                # Unknown format, assume HWC and convert
+                                image_tensor = torch.from_numpy(image_data).permute(2, 0, 1).float() / 255.0
+                        else:
+                            # Handle 2D images by adding channel dimension
+                            if len(image_data.shape) == 2:
+                                image_tensor = torch.from_numpy(image_data).unsqueeze(0).float() / 255.0
+                            else:
+                                # Fallback: try to reshape to CHW format
+                                image_tensor = torch.from_numpy(image_data).float() / 255.0
+                                if image_tensor.dim() == 1:
+                                    # Try to reshape to square image
+                                    size = int(np.sqrt(image_tensor.shape[0] / 3))
+                                    if size * size * 3 == image_tensor.shape[0]:
+                                        image_tensor = image_tensor.view(3, size, size)
+                                    else:
+                                        # Create placeholder if can't reshape
+                                        image_tensor = torch.zeros(3, 96, 96, dtype=torch.float32)
                         image_sequence.append(image_tensor)
                     else:
                         # Create a placeholder image if no image data
@@ -211,17 +234,37 @@ def _add_image_observations(self, timestep: Dict[str, torch.Tensor], trajectory:
                     # Make a copy to ensure the array is writable
                     image_data = image_data.copy()
                     # Convert to tensor, ensure it's in CHW format
-                    if len(image_data.shape) == 3 and image_data.shape[2] == 3:  # HWC format
-                        image_tensor = torch.from_numpy(image_data).permute(2, 0, 1).float() / 255.0
-                    else:  # Already in CHW format
-                        image_tensor = torch.from_numpy(image_data).float() / 255.0
+                    if len(image_data.shape) == 3:
+                        # Check if it's HWC format (height, width, channels)
+                        if image_data.shape[2] == 3:  # HWC format
+                            image_tensor = torch.from_numpy(image_data).permute(2, 0, 1).float() / 255.0
+                        elif image_data.shape[0] == 3:  # Already CHW format
+                            image_tensor = torch.from_numpy(image_data).float() / 255.0
+                        else:
+                            # Unknown format, assume HWC and convert
+                            image_tensor = torch.from_numpy(image_data).permute(2, 0, 1).float() / 255.0
+                    else:
+                        # Handle 2D images by adding channel dimension
+                        if len(image_data.shape) == 2:
+                            image_tensor = torch.from_numpy(image_data).unsqueeze(0).float() / 255.0
+                        else:
+                            # Fallback: try to reshape to CHW format
+                            image_tensor = torch.from_numpy(image_data).float() / 255.0
+                            if image_tensor.dim() == 1:
+                                # Try to reshape to square image
+                                size = int(np.sqrt(image_tensor.shape[0] / 3))
+                                if size * size * 3 == image_tensor.shape[0]:
+                                    image_tensor = image_tensor.view(3, size, size)
+                                else:
+                                    # Create placeholder if can't reshape
+                                    image_tensor = torch.zeros(3, 96, 96, dtype=torch.float32)
                     timestep['observation.image'] = image_tensor
                 else:
                     # Create a placeholder image if no image data
-                    timestep['observation.image'] = torch.zeros(3, 64, 64, dtype=torch.float32)
+                    timestep['observation.image'] = torch.zeros(3, 96, 96, dtype=torch.float32)
             else:
                 # Create a placeholder image if frame is out of range
-                timestep['observation.image'] = torch.zeros(3, 64, 64, dtype=torch.float32)
+                timestep['observation.image'] = torch.zeros(3, 96, 96, dtype=torch.float32)
     
     def get_torch_dataset(self) -> torch_data.Dataset:
         """Get PyTorch dataset."""
@@ -296,6 +339,10 @@ def get_dataset_stats(self) -> Dict[str, Dict[str, torch.Tensor]]:
         if all_actions:
             try:
                 actions = torch.stack(all_actions)
+                # Transpose actions from [samples, horizon, action_dim] to [samples, action_dim, horizon]
+                # to match the expected format for DiffusionPolicy
+                if len(actions.shape) == 3:
+                    actions = actions.transpose(1, 2)  # [samples, action_dim, horizon]
                 stats['action'] = {
                     'mean': actions.mean(dim=0),
                     'std': actions.std(dim=0),
diff --git a/examples/lerobot/run_pipeline.py b/examples/lerobot/run_pipeline.py
@@ -149,64 +149,58 @@ def run_complete_pipeline(dataset_name: str, num_episodes: int = None,
             policy_features = pipeline.get_policy_features()
             dataset_stats = pipeline.get_dataset_stats()
             
+            
             # Create policy configuration
             cfg = DiffusionConfig(
                 input_features=policy_features['input_features'],
                 output_features=policy_features['output_features'],
-                crop_shape=None  # Disable cropping since our images are 96x96
+                crop_shape=None,  # Disable cropping since our images are 96x96
+                horizon=16  # Match the horizon used in RoboDM data generation
             )
             
+            
             # Create and setup policy
             policy = DiffusionPolicy(cfg, dataset_stats=dataset_stats)
             policy.train()
             policy.to(device)
             
-            # Setup training with custom collate function for DiffusionPolicy
+            # Use observation sequence collate function for DiffusionPolicy
+            from torch.utils.data import default_collate
+            
             def collate_fn(batch):
-                """Custom collate function that creates observation sequences for DiffusionPolicy."""
-                result = {}
+                """Collate function for DiffusionPolicy training with RoboDM data."""
+                if not batch:
+                    return {}
+                
+                # Use default collate for everything
+                from torch.utils.data import default_collate
+                collated = default_collate(batch)
+                
                 batch_size = len(batch)
                 n_obs_steps = 2  # DiffusionPolicy default
                 
-                # Stack all non-sequence keys normally
-                for key in batch[0].keys():
-                    if key not in ['observation.image', 'observation.state']:
-                        values = [item[key] for item in batch if item[key] is not None]
-                        if values and all(isinstance(v, torch.Tensor) for v in values):
-                            try:
-                                result[key] = torch.stack(values)
-                            except RuntimeError:
-                                result[key] = values[0].unsqueeze(0).repeat(len(batch), *([1] * (values[0].dim())))
-                        elif values:
-                            result[key] = values[0] if len(values) == 1 else values
+                # Create observation sequences for DiffusionPolicy
+                if 'observation.image' in collated:
+                    # Images: [B, C, H, W] -> [B, T, C, H, W]
+                    images = collated['observation.image']
+                    # Create temporal sequence by repeating current observation
+                    image_seq = images.unsqueeze(1).repeat(1, n_obs_steps, 1, 1, 1)
+                    collated['observation.image'] = image_seq
                 
-                # Handle observation sequences specially
-                if 'observation.image' in batch[0]:
-                    # Create observation.images with proper sequence format
-                    images = []
-                    for i in range(batch_size):
-                        # Get current observation
-                        current_obs = batch[i]['observation.image']
-                        # For simplicity, repeat current observation for n_obs_steps
-                        # In a proper implementation, you'd track actual historical observations
-                        obs_sequence = current_obs.unsqueeze(0).repeat(n_obs_steps, 1, 1, 1)  # [n_obs_steps, C, H, W]
-                        obs_sequence = obs_sequence.unsqueeze(1)  # Add camera dim: [n_obs_steps, 1, C, H, W]
-                        images.append(obs_sequence)
-                    result['observation.images'] = torch.stack(images)  # [B, n_obs_steps, 1, C, H, W]
+                if 'observation.state' in collated:
+                    # States: [B, state_dim] -> [B, T, state_dim]
+                    states = collated['observation.state']
+                    state_seq = states.unsqueeze(1).repeat(1, n_obs_steps, 1)
+                    collated['observation.state'] = state_seq
                 
-                if 'observation.state' in batch[0]:
-                    # Create observation.state sequence
-                    states = []
-                    for i in range(batch_size):
-                        current_state = batch[i]['observation.state']
-                        # Repeat current state for n_obs_steps
-                        state_sequence = current_state.unsqueeze(0).repeat(n_obs_steps, 1)  # [n_obs_steps, state_dim]
-                        states.append(state_sequence)
-                    result['observation.state'] = torch.stack(states)  # [B, n_obs_steps, state_dim]
+                if 'action' in collated:
+                    # Actions: [B, horizon, action_dim] -> [B, action_dim, horizon]
+                    if collated['action'].ndim == 3:
+                        collated['action'] = collated['action'].transpose(1, 2)
                 
-                return result
+                return collated
             
-            dataloader = pipeline.get_dataloader(batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
+            dataloader = pipeline.get_dataloader(batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=collate_fn)
             optimizer = torch.optim.Adam(policy.parameters(), lr=lr)
             
             # Training loop
@@ -220,6 +214,7 @@ def collate_fn(batch):
                     batch = {k: (v.to(device) if isinstance(v, torch.Tensor) else v) 
                             for k, v in batch.items()}
                     
+                    
                     loss, _ = policy.forward(batch)
                     loss.backward()
                     optimizer.step()
@@ -317,7 +312,7 @@ def main():
     # Dataset arguments
     parser.add_argument("--dataset", type=str, default="lerobot/pusht",
                        help="LeRobot dataset name (e.g., lerobot/pusht)")
-    parser.add_argument("--num_episodes", type=int, default=50,
+    parser.add_argument("--num_episodes", type=int, default=5,
-                       help="Number of episodes to convert (default: 50)")
+                       help="Number of episodes to convert (default: 5)")
     parser.add_argument("--robodm_data_dir", type=str, default=None,
                        help="Directory containing existing RoboDM data (skips ingestion)")